author	NeilBrown <neilb@suse.de>	2010-05-21 18:31:36 -0400
committer	NeilBrown <neilb@suse.de>	2010-05-21 18:31:36 -0400
commit	19fdb9eefb21b72edbc365b838502780c392bad6 (patch)
tree	deae04c48532d6eab64ed4b0396737bb854b5506 /fs
parent	be6800a73aa2f3dc14744c3b80e676d189789f04 (diff)
parent	3ff195b011d7decf501a4d55aeed312731094796 (diff)
Merge commit '3ff195b011d7decf501a4d55aeed312731094796' into for-linus
Conflicts:
	drivers/md/md.c

- Resolved conflict in md_update_sb
- Added extra 'NULL' arg to new instance of sysfs_get_dirent.

Signed-off-by: NeilBrown <neilb@suse.de>
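A note on the sysfs_get_dirent() change mentioned above: the sysfs work merged
in here gave sysfs_get_dirent() an extra namespace-tag parameter, so a call
site written against the old two-argument form has to pass NULL through the
new argument. A minimal sketch of the shape of that resolution follows; the
"sync_action" attribute name is illustrative, not quoted from md.c:

	/* before the merge: old two-argument form */
	sd = sysfs_get_dirent(mddev->kobj.sd, "sync_action");

	/* after the merge: untagged sysfs entries pass a NULL namespace tag */
	sd = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action");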
Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/cache.c | 1
-rw-r--r--  fs/9p/fid.c | 13
-rw-r--r--  fs/9p/v9fs.c | 32
-rw-r--r--  fs/9p/v9fs.h | 9
-rw-r--r--  fs/9p/vfs_dentry.c | 1
-rw-r--r--  fs/9p/vfs_dir.c | 14
-rw-r--r--  fs/9p/vfs_file.c | 4
-rw-r--r--  fs/9p/vfs_inode.c | 10
-rw-r--r--  fs/9p/vfs_super.c | 5
-rw-r--r--  fs/Kconfig | 1
-rw-r--r--  fs/Makefile | 1
-rw-r--r--  fs/adfs/super.c | 1
-rw-r--r--  fs/affs/bitmap.c | 3
-rw-r--r--  fs/affs/inode.c | 1
-rw-r--r--  fs/affs/super.c | 1
-rw-r--r--  fs/afs/cache.c | 1
-rw-r--r--  fs/afs/cmservice.c | 1
-rw-r--r--  fs/afs/dir.c | 1
-rw-r--r--  fs/afs/file.c | 2
-rw-r--r--  fs/afs/fsclient.c | 1
-rw-r--r--  fs/afs/inode.c | 1
-rw-r--r--  fs/afs/internal.h | 2
-rw-r--r--  fs/afs/mntpt.c | 26
-rw-r--r--  fs/afs/rxrpc.c | 1
-rw-r--r--  fs/afs/security.c | 5
-rw-r--r--  fs/afs/super.c | 1
-rw-r--r--  fs/afs/vlclient.c | 1
-rw-r--r--  fs/afs/vlocation.c | 1
-rw-r--r--  fs/afs/vnode.c | 1
-rw-r--r--  fs/afs/volume.c | 7
-rw-r--r--  fs/anon_inodes.c | 2
-rw-r--r--  fs/autofs/root.c | 1
-rw-r--r--  fs/autofs4/dev-ioctl.c | 1
-rw-r--r--  fs/autofs4/root.c | 6
-rw-r--r--  fs/befs/datastream.c | 1
-rw-r--r--  fs/binfmt_aout.c | 16
-rw-r--r--  fs/binfmt_elf_fdpic.c | 11
-rw-r--r--  fs/binfmt_em86.c | 1
-rw-r--r--  fs/binfmt_flat.c | 2
-rw-r--r--  fs/binfmt_script.c | 1
-rw-r--r--  fs/bio-integrity.c | 1
-rw-r--r--  fs/bio.c | 11
-rw-r--r--  fs/block_dev.c | 22
-rw-r--r--  fs/btrfs/acl.c | 1
-rw-r--r--  fs/btrfs/async-thread.c | 1
-rw-r--r--  fs/btrfs/btrfs_inode.h | 5
-rw-r--r--  fs/btrfs/compression.c | 23
-rw-r--r--  fs/btrfs/ctree.c | 5
-rw-r--r--  fs/btrfs/ctree.h | 15
-rw-r--r--  fs/btrfs/delayed-ref.c | 1
-rw-r--r--  fs/btrfs/disk-io.c | 44
-rw-r--r--  fs/btrfs/export.c | 4
-rw-r--r--  fs/btrfs/extent-tree.c | 55
-rw-r--r--  fs/btrfs/extent_io.c | 99
-rw-r--r--  fs/btrfs/extent_io.h | 10
-rw-r--r--  fs/btrfs/extent_map.c | 3
-rw-r--r--  fs/btrfs/file-item.c | 1
-rw-r--r--  fs/btrfs/file.c | 24
-rw-r--r--  fs/btrfs/free-space-cache.c | 5
-rw-r--r--  fs/btrfs/inode.c | 199
-rw-r--r--  fs/btrfs/ioctl.c | 715
-rw-r--r--  fs/btrfs/ioctl.h | 111
-rw-r--r--  fs/btrfs/locking.c | 1
-rw-r--r--  fs/btrfs/ordered-data.c | 48
-rw-r--r--  fs/btrfs/ordered-data.h | 9
-rw-r--r--  fs/btrfs/ref-cache.c | 1
-rw-r--r--  fs/btrfs/ref-cache.h | 2
-rw-r--r--  fs/btrfs/relocation.c | 9
-rw-r--r--  fs/btrfs/super.c | 259
-rw-r--r--  fs/btrfs/sysfs.c | 4
-rw-r--r--  fs/btrfs/transaction.c | 120
-rw-r--r--  fs/btrfs/tree-log.c | 3
-rw-r--r--  fs/btrfs/volumes.c | 56
-rw-r--r--  fs/buffer.c | 15
-rw-r--r--  fs/cachefiles/interface.c | 1
-rw-r--r--  fs/cachefiles/internal.h | 1
-rw-r--r--  fs/cachefiles/namei.c | 99
-rw-r--r--  fs/cachefiles/rdwr.c | 1
-rw-r--r--  fs/cachefiles/security.c | 4
-rw-r--r--  fs/cachefiles/xattr.c | 1
-rw-r--r--  fs/ceph/Kconfig | 27
-rw-r--r--  fs/ceph/Makefile | 39
-rw-r--r--  fs/ceph/README | 20
-rw-r--r--  fs/ceph/addr.c | 1187
-rw-r--r--  fs/ceph/armor.c | 99
-rw-r--r--  fs/ceph/auth.c | 259
-rw-r--r--  fs/ceph/auth.h | 84
-rw-r--r--  fs/ceph/auth_none.c | 122
-rw-r--r--  fs/ceph/auth_none.h | 30
-rw-r--r--  fs/ceph/auth_x.c | 668
-rw-r--r--  fs/ceph/auth_x.h | 49
-rw-r--r--  fs/ceph/auth_x_protocol.h | 90
-rw-r--r--  fs/ceph/buffer.c | 81
-rw-r--r--  fs/ceph/buffer.h | 39
-rw-r--r--  fs/ceph/caps.c | 2960
-rw-r--r--  fs/ceph/ceph_debug.h | 37
-rw-r--r--  fs/ceph/ceph_frag.c | 21
-rw-r--r--  fs/ceph/ceph_frag.h | 109
-rw-r--r--  fs/ceph/ceph_fs.c | 74
-rw-r--r--  fs/ceph/ceph_fs.h | 650
-rw-r--r--  fs/ceph/ceph_hash.c | 118
-rw-r--r--  fs/ceph/ceph_hash.h | 13
-rw-r--r--  fs/ceph/ceph_strings.c | 176
-rw-r--r--  fs/ceph/crush/crush.c | 151
-rw-r--r--  fs/ceph/crush/crush.h | 180
-rw-r--r--  fs/ceph/crush/hash.c | 149
-rw-r--r--  fs/ceph/crush/hash.h | 17
-rw-r--r--  fs/ceph/crush/mapper.c | 596
-rw-r--r--  fs/ceph/crush/mapper.h | 20
-rw-r--r--  fs/ceph/crypto.c | 409
-rw-r--r--  fs/ceph/crypto.h | 48
-rw-r--r--  fs/ceph/debugfs.c | 484
-rw-r--r--  fs/ceph/decode.h | 194
-rw-r--r--  fs/ceph/dir.c | 1233
-rw-r--r--  fs/ceph/export.c | 224
-rw-r--r--  fs/ceph/file.c | 939
-rw-r--r--  fs/ceph/inode.c | 1782
-rw-r--r--  fs/ceph/ioctl.c | 160
-rw-r--r--  fs/ceph/ioctl.h | 40
-rw-r--r--  fs/ceph/mds_client.c | 3047
-rw-r--r--  fs/ceph/mds_client.h | 335
-rw-r--r--  fs/ceph/mdsmap.c | 174
-rw-r--r--  fs/ceph/mdsmap.h | 54
-rw-r--r--  fs/ceph/messenger.c | 2284
-rw-r--r--  fs/ceph/messenger.h | 256
-rw-r--r--  fs/ceph/mon_client.c | 835
-rw-r--r--  fs/ceph/mon_client.h | 119
-rw-r--r--  fs/ceph/msgpool.c | 186
-rw-r--r--  fs/ceph/msgpool.h | 27
-rw-r--r--  fs/ceph/msgr.h | 158
-rw-r--r--  fs/ceph/osd_client.c | 1564
-rw-r--r--  fs/ceph/osd_client.h | 167
-rw-r--r--  fs/ceph/osdmap.c | 1081
-rw-r--r--  fs/ceph/osdmap.h | 128
-rw-r--r--  fs/ceph/pagelist.c | 55
-rw-r--r--  fs/ceph/pagelist.h | 54
-rw-r--r--  fs/ceph/rados.h | 377
-rw-r--r--  fs/ceph/snap.c | 911
-rw-r--r--  fs/ceph/super.c | 1041
-rw-r--r--  fs/ceph/super.h | 902
-rw-r--r--  fs/ceph/types.h | 29
-rw-r--r--  fs/ceph/xattr.c | 845
-rw-r--r--  fs/cifs/asn1.c | 105
-rw-r--r--  fs/cifs/cifs_debug.c | 48
-rw-r--r--  fs/cifs/cifs_debug.h | 42
-rw-r--r--  fs/cifs/cifs_dfs_ref.c | 37
-rw-r--r--  fs/cifs/cifs_fs_sb.h | 3
-rw-r--r--  fs/cifs/cifs_spnego.c | 7
-rw-r--r--  fs/cifs/cifs_unicode.c | 6
-rw-r--r--  fs/cifs/cifsacl.c | 77
-rw-r--r--  fs/cifs/cifsencrypt.c | 11
-rw-r--r--  fs/cifs/cifsfs.c | 177
-rw-r--r--  fs/cifs/cifsfs.h | 5
-rw-r--r--  fs/cifs/cifsglob.h | 12
-rw-r--r--  fs/cifs/cifsproto.h | 36
-rw-r--r--  fs/cifs/cifssmb.c | 601
-rw-r--r--  fs/cifs/connect.c | 640
-rw-r--r--  fs/cifs/dir.c | 93
-rw-r--r--  fs/cifs/dns_resolve.c | 17
-rw-r--r--  fs/cifs/export.c | 2
-rw-r--r--  fs/cifs/file.c | 254
-rw-r--r--  fs/cifs/inode.c | 422
-rw-r--r--  fs/cifs/ioctl.c | 10
-rw-r--r--  fs/cifs/link.c | 11
-rw-r--r--  fs/cifs/misc.c | 81
-rw-r--r--  fs/cifs/netmisc.c | 16
-rw-r--r--  fs/cifs/readdir.c | 86
-rw-r--r--  fs/cifs/sess.c | 82
-rw-r--r--  fs/cifs/smbencrypt.c | 1
-rw-r--r--  fs/cifs/transport.c | 93
-rw-r--r--  fs/cifs/xattr.c | 41
-rw-r--r--  fs/coda/dir.c | 1
-rw-r--r--  fs/coda/file.c | 1
-rw-r--r--  fs/coda/inode.c | 9
-rw-r--r--  fs/coda/upcall.c | 1
-rw-r--r--  fs/compat.c | 21
-rw-r--r--  fs/compat_ioctl.c | 5
-rw-r--r--  fs/configfs/dir.c | 4
-rw-r--r--  fs/configfs/inode.c | 1
-rw-r--r--  fs/configfs/mount.c | 1
-rw-r--r--  fs/configfs/symlink.c | 1
-rw-r--r--  fs/debugfs/inode.c | 1
-rw-r--r--  fs/devpts/inode.c | 1
-rw-r--r--  fs/dlm/config.c | 1
-rw-r--r--  fs/dlm/debug_fs.c | 1
-rw-r--r--  fs/dlm/lock.c | 6
-rw-r--r--  fs/dlm/lockspace.c | 2
-rw-r--r--  fs/dlm/lowcomms.c | 1
-rw-r--r--  fs/dlm/member.c | 2
-rw-r--r--  fs/dlm/netlink.c | 1
-rw-r--r--  fs/dlm/plock.c | 1
-rw-r--r--  fs/dlm/user.c | 89
-rw-r--r--  fs/ecryptfs/crypto.c | 38
-rw-r--r--  fs/ecryptfs/dentry.c | 1
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h | 15
-rw-r--r--  fs/ecryptfs/file.c | 1
-rw-r--r--  fs/ecryptfs/inode.c | 130
-rw-r--r--  fs/ecryptfs/keystore.c | 1
-rw-r--r--  fs/ecryptfs/kthread.c | 1
-rw-r--r--  fs/ecryptfs/main.c | 11
-rw-r--r--  fs/ecryptfs/messaging.c | 1
-rw-r--r--  fs/ecryptfs/miscdev.c | 1
-rw-r--r--  fs/ecryptfs/mmap.c | 39
-rw-r--r--  fs/ecryptfs/super.c | 3
-rw-r--r--  fs/eventfd.c | 1
-rw-r--r--  fs/eventpoll.c | 3
-rw-r--r--  fs/exec.c | 2
-rw-r--r--  fs/exofs/exofs.h | 2
-rw-r--r--  fs/exofs/inode.c | 1
-rw-r--r--  fs/exofs/ios.c | 1
-rw-r--r--  fs/exofs/super.c | 9
-rw-r--r--  fs/ext2/balloc.c | 1
-rw-r--r--  fs/ext2/symlink.c | 2
-rw-r--r--  fs/ext2/xattr_security.c | 1
-rw-r--r--  fs/ext3/balloc.c | 1
-rw-r--r--  fs/ext3/ialloc.c | 4
-rw-r--r--  fs/ext3/inode.c | 2
-rw-r--r--  fs/ext3/super.c | 2
-rw-r--r--  fs/ext3/symlink.c | 2
-rw-r--r--  fs/ext3/xattr_security.c | 1
-rw-r--r--  fs/ext4/block_validity.c | 1
-rw-r--r--  fs/ext4/extents.c | 1
-rw-r--r--  fs/ext4/ialloc.c | 4
-rw-r--r--  fs/ext4/inode.c | 8
-rw-r--r--  fs/ext4/mballoc.c | 24
-rw-r--r--  fs/ext4/migrate.c | 1
-rw-r--r--  fs/ext4/move_extent.c | 1
-rw-r--r--  fs/ext4/super.c | 33
-rw-r--r--  fs/ext4/xattr_security.c | 1
-rw-r--r--  fs/fat/cache.c | 1
-rw-r--r--  fs/fat/inode.c | 2
-rw-r--r--  fs/fat/namei_vfat.c | 33
-rw-r--r--  fs/fcntl.c | 66
-rw-r--r--  fs/fifo.c | 1
-rw-r--r--  fs/filesystems.c | 2
-rw-r--r--  fs/freevxfs/vxfs_subr.c | 1
-rw-r--r--  fs/fs-writeback.c | 134
-rw-r--r--  fs/fscache/Kconfig | 1
-rw-r--r--  fs/fscache/object-list.c | 1
-rw-r--r--  fs/fscache/object.c | 6
-rw-r--r--  fs/fscache/operation.c | 5
-rw-r--r--  fs/fscache/page.c | 2
-rw-r--r--  fs/fscache/stats.c | 4
-rw-r--r--  fs/fuse/cuse.c | 1
-rw-r--r--  fs/fuse/inode.c | 2
-rw-r--r--  fs/generic_acl.c | 1
-rw-r--r--  fs/gfs2/Kconfig | 1
-rw-r--r--  fs/gfs2/aops.c | 8
-rw-r--r--  fs/gfs2/bmap.c | 18
-rw-r--r--  fs/gfs2/dentry.c | 1
-rw-r--r--  fs/gfs2/dir.c | 2
-rw-r--r--  fs/gfs2/export.c | 3
-rw-r--r--  fs/gfs2/file.c | 2
-rw-r--r--  fs/gfs2/glock.c | 3
-rw-r--r--  fs/gfs2/glops.c | 1
-rw-r--r--  fs/gfs2/incore.h | 13
-rw-r--r--  fs/gfs2/inode.c | 101
-rw-r--r--  fs/gfs2/inode.h | 5
-rw-r--r--  fs/gfs2/lock_dlm.c | 1
-rw-r--r--  fs/gfs2/log.c | 161
-rw-r--r--  fs/gfs2/log.h | 1
-rw-r--r--  fs/gfs2/lops.c | 2
-rw-r--r--  fs/gfs2/main.c | 2
-rw-r--r--  fs/gfs2/meta_io.c | 5
-rw-r--r--  fs/gfs2/ops_fstype.c | 21
-rw-r--r--  fs/gfs2/quota.c | 102
-rw-r--r--  fs/gfs2/rgrp.c | 68
-rw-r--r--  fs/gfs2/rgrp.h | 2
-rw-r--r--  fs/gfs2/super.c | 11
-rw-r--r--  fs/gfs2/sys.c | 11
-rw-r--r--  fs/gfs2/trans.c | 18
-rw-r--r--  fs/gfs2/util.c | 1
-rw-r--r--  fs/hfs/bnode.c | 1
-rw-r--r--  fs/hfs/btree.c | 1
-rw-r--r--  fs/hfs/mdb.c | 1
-rw-r--r--  fs/hfs/super.c | 1
-rw-r--r--  fs/hfsplus/options.c | 1
-rw-r--r--  fs/hostfs/hostfs_kern.c | 1
-rw-r--r--  fs/hpfs/buffer.c | 1
-rw-r--r--  fs/hpfs/dir.c | 1
-rw-r--r--  fs/hpfs/inode.c | 1
-rw-r--r--  fs/hpfs/super.c | 1
-rw-r--r--  fs/inode.c | 2
-rw-r--r--  fs/ioctl.c | 92
-rw-r--r--  fs/ioprio.c | 1
-rw-r--r--  fs/isofs/dir.c | 1
-rw-r--r--  fs/isofs/namei.c | 1
-rw-r--r--  fs/jbd/commit.c | 1
-rw-r--r--  fs/jbd/recovery.c | 1
-rw-r--r--  fs/jbd/transaction.c | 2
-rw-r--r--  fs/jbd2/journal.c | 2
-rw-r--r--  fs/jbd2/recovery.c | 1
-rw-r--r--  fs/jffs2/background.c | 3
-rw-r--r--  fs/jffs2/compr_lzo.c | 1
-rw-r--r--  fs/jffs2/compr_zlib.c | 1
-rw-r--r--  fs/jffs2/debug.c | 1
-rw-r--r--  fs/jffs2/erase.c | 12
-rw-r--r--  fs/jffs2/file.c | 1
-rw-r--r--  fs/jffs2/fs.c | 10
-rw-r--r--  fs/jffs2/gc.c | 17
-rw-r--r--  fs/jffs2/nodelist.c | 1
-rw-r--r--  fs/jffs2/nodelist.h | 10
-rw-r--r--  fs/jffs2/nodemgmt.c | 29
-rw-r--r--  fs/jffs2/os-linux.h | 3
-rw-r--r--  fs/jffs2/readinode.c | 2
-rw-r--r--  fs/jffs2/scan.c | 4
-rw-r--r--  fs/jffs2/super.c | 2
-rw-r--r--  fs/jffs2/symlink.c | 1
-rw-r--r--  fs/jffs2/wbuf.c | 8
-rw-r--r--  fs/jffs2/write.c | 1
-rw-r--r--  fs/jfs/acl.c | 1
-rw-r--r--  fs/jfs/inode.c | 2
-rw-r--r--  fs/jfs/jfs_dmap.c | 19
-rw-r--r--  fs/jfs/jfs_dmap.h | 6
-rw-r--r--  fs/jfs/jfs_dtree.c | 1
-rw-r--r--  fs/jfs/jfs_imap.c | 1
-rw-r--r--  fs/jfs/jfs_inode.h | 1
-rw-r--r--  fs/jfs/jfs_logmgr.c | 1
-rw-r--r--  fs/jfs/jfs_metapage.c | 1
-rw-r--r--  fs/jfs/jfs_unicode.h | 1
-rw-r--r--  fs/jfs/namei.c | 4
-rw-r--r--  fs/jfs/resize.c | 6
-rw-r--r--  fs/jfs/super.c | 14
-rw-r--r--  fs/jfs/symlink.c | 14
-rw-r--r--  fs/jfs/xattr.c | 1
-rw-r--r--  fs/libfs.c | 36
-rw-r--r--  fs/lockd/clntlock.c | 1
-rw-r--r--  fs/lockd/clntproc.c | 1
-rw-r--r--  fs/lockd/mon.c | 1
-rw-r--r--  fs/lockd/svc.c | 1
-rw-r--r--  fs/lockd/svc4proc.c | 1
-rw-r--r--  fs/lockd/svclock.c | 1
-rw-r--r--  fs/lockd/svcproc.c | 1
-rw-r--r--  fs/lockd/svcsubs.c | 1
-rw-r--r--  fs/locks.c | 2
-rw-r--r--  fs/logfs/dev_bdev.c | 16
-rw-r--r--  fs/logfs/dev_mtd.c | 26
-rw-r--r--  fs/logfs/dir.c | 8
-rw-r--r--  fs/logfs/file.c | 16
-rw-r--r--  fs/logfs/gc.c | 58
-rw-r--r--  fs/logfs/inode.c | 7
-rw-r--r--  fs/logfs/journal.c | 44
-rw-r--r--  fs/logfs/logfs.h | 31
-rw-r--r--  fs/logfs/logfs_abi.h | 10
-rw-r--r--  fs/logfs/readwrite.c | 106
-rw-r--r--  fs/logfs/segment.c | 70
-rw-r--r--  fs/logfs/super.c | 45
-rw-r--r--  fs/minix/itree_v1.c | 1
-rw-r--r--  fs/mpage.c | 3
-rw-r--r--  fs/namei.c | 45
-rw-r--r--  fs/namespace.c | 19
-rw-r--r--  fs/ncpfs/dir.c | 1
-rw-r--r--  fs/ncpfs/file.c | 1
-rw-r--r--  fs/ncpfs/inode.c | 8
-rw-r--r--  fs/ncpfs/ioctl.c | 1
-rw-r--r--  fs/ncpfs/mmap.c | 2
-rw-r--r--  fs/ncpfs/sock.c | 1
-rw-r--r--  fs/ncpfs/symlink.c | 1
-rw-r--r--  fs/nfs/cache_lib.c | 1
-rw-r--r--  fs/nfs/callback_proc.c | 1
-rw-r--r--  fs/nfs/callback_xdr.c | 2
-rw-r--r--  fs/nfs/client.c | 61
-rw-r--r--  fs/nfs/delegation.c | 89
-rw-r--r--  fs/nfs/delegation.h | 6
-rw-r--r--  fs/nfs/dir.c | 151
-rw-r--r--  fs/nfs/direct.c | 1
-rw-r--r--  fs/nfs/dns_resolve.c | 1
-rw-r--r--  fs/nfs/file.c | 20
-rw-r--r--  fs/nfs/fscache.c | 4
-rw-r--r--  fs/nfs/getroot.c | 191
-rw-r--r--  fs/nfs/inode.c | 69
-rw-r--r--  fs/nfs/internal.h | 4
-rw-r--r--  fs/nfs/iostat.h | 6
-rw-r--r--  fs/nfs/namespace.c | 21
-rw-r--r--  fs/nfs/nfs2xdr.c | 1
-rw-r--r--  fs/nfs/nfs3acl.c | 24
-rw-r--r--  fs/nfs/nfs3proc.c | 129
-rw-r--r--  fs/nfs/nfs3xdr.c | 3
-rw-r--r--  fs/nfs/nfs4_fs.h | 8
-rw-r--r--  fs/nfs/nfs4namespace.c | 13
-rw-r--r--  fs/nfs/nfs4proc.c | 186
-rw-r--r--  fs/nfs/nfs4state.c | 36
-rw-r--r--  fs/nfs/nfs4xdr.c | 25
-rw-r--r--  fs/nfs/nfsroot.c | 14
-rw-r--r--  fs/nfs/pagelist.c | 37
-rw-r--r--  fs/nfs/proc.c | 145
-rw-r--r--  fs/nfs/read.c | 4
-rw-r--r--  fs/nfs/super.c | 176
-rw-r--r--  fs/nfs/symlink.c | 1
-rw-r--r--  fs/nfs/unlink.c | 4
-rw-r--r--  fs/nfs/write.c | 95
-rw-r--r--  fs/nfs_common/nfsacl.c | 1
-rw-r--r--  fs/nfsd/export.c | 45
-rw-r--r--  fs/nfsd/nfs2acl.c | 1
-rw-r--r--  fs/nfsd/nfs3acl.c | 1
-rw-r--r--  fs/nfsd/nfs4acl.c | 1
-rw-r--r--  fs/nfsd/nfs4callback.c | 141
-rw-r--r--  fs/nfsd/nfs4idmap.c | 1
-rw-r--r--  fs/nfsd/nfs4proc.c | 51
-rw-r--r--  fs/nfsd/nfs4recover.c | 1
-rw-r--r--  fs/nfsd/nfs4state.c | 377
-rw-r--r--  fs/nfsd/nfs4xdr.c | 38
-rw-r--r--  fs/nfsd/nfscache.c | 2
-rw-r--r--  fs/nfsd/nfsctl.c | 65
-rw-r--r--  fs/nfsd/nfsd.h | 6
-rw-r--r--  fs/nfsd/nfssvc.c | 2
-rw-r--r--  fs/nfsd/state.h | 47
-rw-r--r--  fs/nfsd/vfs.c | 9
-rw-r--r--  fs/nfsd/vfs.h | 1
-rw-r--r--  fs/nfsd/xdr4.h | 11
-rw-r--r--  fs/nilfs2/alloc.c | 157
-rw-r--r--  fs/nilfs2/alloc.h | 9
-rw-r--r--  fs/nilfs2/btnode.c | 1
-rw-r--r--  fs/nilfs2/btree.c | 93
-rw-r--r--  fs/nilfs2/btree.h | 23
-rw-r--r--  fs/nilfs2/dat.c | 2
-rw-r--r--  fs/nilfs2/dir.c | 2
-rw-r--r--  fs/nilfs2/gcinode.c | 5
-rw-r--r--  fs/nilfs2/inode.c | 5
-rw-r--r--  fs/nilfs2/ioctl.c | 3
-rw-r--r--  fs/nilfs2/mdt.c | 1
-rw-r--r--  fs/nilfs2/page.c | 5
-rw-r--r--  fs/nilfs2/recovery.c | 3
-rw-r--r--  fs/nilfs2/segbuf.c | 89
-rw-r--r--  fs/nilfs2/segbuf.h | 10
-rw-r--r--  fs/nilfs2/segment.c | 177
-rw-r--r--  fs/nilfs2/segment.h | 10
-rw-r--r--  fs/nilfs2/sufile.c | 2
-rw-r--r--  fs/nilfs2/super.c | 221
-rw-r--r--  fs/nilfs2/the_nilfs.c | 12
-rw-r--r--  fs/nilfs2/the_nilfs.h | 1
-rw-r--r--  fs/notify/fsnotify.c | 1
-rw-r--r--  fs/notify/inode_mark.c | 1
-rw-r--r--  fs/notify/inotify/Kconfig | 1
-rw-r--r--  fs/notify/inotify/inotify_fsnotify.c | 2
-rw-r--r--  fs/notify/inotify/inotify_user.c | 16
-rw-r--r--  fs/ntfs/ChangeLog | 1702
-rw-r--r--  fs/ntfs/aops.c | 1
-rw-r--r--  fs/ntfs/attrib.c | 1
-rw-r--r--  fs/ntfs/compress.c | 1
-rw-r--r--  fs/ntfs/dir.c | 1
-rw-r--r--  fs/ntfs/file.c | 1
-rw-r--r--  fs/ntfs/index.c | 2
-rw-r--r--  fs/ntfs/mft.c | 1
-rw-r--r--  fs/ntfs/namei.c | 1
-rw-r--r--  fs/ntfs/super.c | 25
-rw-r--r--  fs/ocfs2/Makefile | 1
-rw-r--r--  fs/ocfs2/acl.c | 78
-rw-r--r--  fs/ocfs2/alloc.c | 908
-rw-r--r--  fs/ocfs2/alloc.h | 12
-rw-r--r--  fs/ocfs2/aops.c | 3
-rw-r--r--  fs/ocfs2/buffer_head_io.c | 3
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 1
-rw-r--r--  fs/ocfs2/cluster/masklog.c | 3
-rw-r--r--  fs/ocfs2/cluster/masklog.h | 1
-rw-r--r--  fs/ocfs2/cluster/nodemanager.c | 1
-rw-r--r--  fs/ocfs2/cluster/quorum.c | 1
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 7
-rw-r--r--  fs/ocfs2/dir.c | 75
-rw-r--r--  fs/ocfs2/dlm/dlmast.c | 14
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 4
-rw-r--r--  fs/ocfs2/dlm/dlmconvert.c | 5
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 28
-rw-r--r--  fs/ocfs2/dlm/dlmlock.c | 6
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 34
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 27
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 17
-rw-r--r--  fs/ocfs2/dlm/dlmunlock.c | 4
-rw-r--r--  fs/ocfs2/dlmfs/dlmfs.c | 14
-rw-r--r--  fs/ocfs2/dlmglue.c | 2
-rw-r--r--  fs/ocfs2/extent_map.c | 3
-rw-r--r--  fs/ocfs2/file.c | 247
-rw-r--r--  fs/ocfs2/heartbeat.c | 1
-rw-r--r--  fs/ocfs2/inode.c | 129
-rw-r--r--  fs/ocfs2/inode.h | 4
-rw-r--r--  fs/ocfs2/journal.c | 26
-rw-r--r--  fs/ocfs2/journal.h | 15
-rw-r--r--  fs/ocfs2/localalloc.c | 285
-rw-r--r--  fs/ocfs2/localalloc.h | 3
-rw-r--r--  fs/ocfs2/locks.c | 2
-rw-r--r--  fs/ocfs2/mmap.c | 49
-rw-r--r--  fs/ocfs2/namei.c | 175
-rw-r--r--  fs/ocfs2/ocfs2.h | 36
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 144
-rw-r--r--  fs/ocfs2/quota_global.c | 5
-rw-r--r--  fs/ocfs2/quota_local.c | 51
-rw-r--r--  fs/ocfs2/refcounttree.c | 79
-rw-r--r--  fs/ocfs2/refcounttree.h | 4
-rw-r--r--  fs/ocfs2/reservations.c | 847
-rw-r--r--  fs/ocfs2/reservations.h | 159
-rw-r--r--  fs/ocfs2/resize.c | 19
-rw-r--r--  fs/ocfs2/stack_o2cb.c | 1
-rw-r--r--  fs/ocfs2/stack_user.c | 1
-rw-r--r--  fs/ocfs2/suballoc.c | 817
-rw-r--r--  fs/ocfs2/suballoc.h | 26
-rw-r--r--  fs/ocfs2/super.c | 88
-rw-r--r--  fs/ocfs2/super.h | 7
-rw-r--r--  fs/ocfs2/sysfile.c | 1
-rw-r--r--  fs/ocfs2/xattr.c | 115
-rw-r--r--  fs/omfs/inode.c | 2
-rw-r--r--  fs/open.c | 2
-rw-r--r--  fs/partitions/check.c | 1
-rw-r--r--  fs/partitions/efi.c | 1
-rw-r--r--  fs/partitions/msdos.c | 85
-rw-r--r--  fs/proc/array.c | 4
-rw-r--r--  fs/proc/base.c | 18
-rw-r--r--  fs/proc/generic.c | 1
-rw-r--r--  fs/proc/inode.c | 5
-rw-r--r--  fs/proc/kcore.c | 4
-rw-r--r--  fs/proc/kmsg.c | 1
-rw-r--r--  fs/proc/nommu.c | 1
-rw-r--r--  fs/proc/proc_devtree.c | 1
-rw-r--r--  fs/proc/proc_net.c | 1
-rw-r--r--  fs/proc/stat.c | 1
-rw-r--r--  fs/proc/task_mmu.c | 134
-rw-r--r--  fs/proc/task_nommu.c | 1
-rw-r--r--  fs/proc/vmcore.c | 2
-rw-r--r--  fs/qnx4/inode.c | 3
-rw-r--r--  fs/quota/Kconfig | 8
-rw-r--r--  fs/quota/dquot.c | 28
-rw-r--r--  fs/quota/netlink.c | 1
-rw-r--r--  fs/ramfs/file-nommu.c | 1
-rw-r--r--  fs/ramfs/inode.c | 3
-rw-r--r--  fs/read_write.c | 2
-rw-r--r--  fs/reiserfs/bitmap.c | 2
-rw-r--r--  fs/reiserfs/dir.c | 3
-rw-r--r--  fs/reiserfs/fix_node.c | 1
-rw-r--r--  fs/reiserfs/inode.c | 1
-rw-r--r--  fs/reiserfs/journal.c | 16
-rw-r--r--  fs/reiserfs/namei.c | 1
-rw-r--r--  fs/reiserfs/super.c | 11
-rw-r--r--  fs/reiserfs/xattr.c | 20
-rw-r--r--  fs/reiserfs/xattr_acl.c | 1
-rw-r--r--  fs/reiserfs/xattr_security.c | 3
-rw-r--r--  fs/select.c | 17
-rw-r--r--  fs/signalfd.c | 1
-rw-r--r--  fs/smbfs/file.c | 1
-rw-r--r--  fs/smbfs/inode.c | 8
-rw-r--r--  fs/smbfs/smbiod.c | 1
-rw-r--r--  fs/smbfs/symlink.c | 1
-rw-r--r--  fs/splice.c | 1
-rw-r--r--  fs/squashfs/block.c | 5
-rw-r--r--  fs/squashfs/super.c | 4
-rw-r--r--  fs/squashfs/symlink.c | 1
-rw-r--r--  fs/squashfs/zlib_wrapper.c | 4
-rw-r--r--  fs/super.c | 9
-rw-r--r--  fs/sync.c | 4
-rw-r--r--  fs/sysfs/bin.c | 52
-rw-r--r--  fs/sysfs/dir.c | 222
-rw-r--r--  fs/sysfs/file.c | 64
-rw-r--r--  fs/sysfs/group.c | 6
-rw-r--r--  fs/sysfs/inode.c | 18
-rw-r--r--  fs/sysfs/mount.c | 95
-rw-r--r--  fs/sysfs/symlink.c | 51
-rw-r--r--  fs/sysfs/sysfs.h | 40
-rw-r--r--  fs/sysv/dir.c | 2
-rw-r--r--  fs/timerfd.c | 26
-rw-r--r--  fs/ubifs/commit.c | 1
-rw-r--r--  fs/ubifs/debug.c | 1
-rw-r--r--  fs/ubifs/file.c | 1
-rw-r--r--  fs/ubifs/gc.c | 1
-rw-r--r--  fs/ubifs/io.c | 2
-rw-r--r--  fs/ubifs/lpt.c | 1
-rw-r--r--  fs/ubifs/lpt_commit.c | 1
-rw-r--r--  fs/ubifs/recovery.c | 1
-rw-r--r--  fs/ubifs/sb.c | 1
-rw-r--r--  fs/ubifs/tnc.c | 1
-rw-r--r--  fs/ubifs/ubifs.h | 1
-rw-r--r--  fs/ubifs/xattr.c | 1
-rw-r--r--  fs/udf/balloc.c | 59
-rw-r--r--  fs/udf/file.c | 2
-rw-r--r--  fs/udf/inode.c | 38
-rw-r--r--  fs/udf/namei.c | 9
-rw-r--r--  fs/udf/partition.c | 1
-rw-r--r--  fs/udf/symlink.c | 1
-rw-r--r--  fs/udf/udfdecl.h | 3
-rw-r--r--  fs/udf/unicode.c | 1
-rw-r--r--  fs/ufs/super.c | 3
-rw-r--r--  fs/ufs/ufs_fs.h | 15
-rw-r--r--  fs/xattr_acl.c | 2
-rw-r--r--  fs/xfs/linux-2.6/kmem.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_acl.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 245
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 110
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.h | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c | 5
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.c | 5
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c | 6
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 16
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c | 207
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.h | 7
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.c | 4
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.h | 150
-rw-r--r--  fs/xfs/quota/xfs_dquot.c | 193
-rw-r--r--  fs/xfs/quota/xfs_dquot.h | 35
-rw-r--r--  fs/xfs/quota/xfs_dquot_item.c | 30
-rw-r--r--  fs/xfs/quota/xfs_qm.c | 609
-rw-r--r--  fs/xfs/quota/xfs_qm.h | 23
-rw-r--r--  fs/xfs/quota/xfs_qm_stats.c | 2
-rw-r--r--  fs/xfs/quota/xfs_qm_syscalls.c | 155
-rw-r--r--  fs/xfs/quota/xfs_quota_priv.h | 102
-rw-r--r--  fs/xfs/quota/xfs_trans_dquot.c | 29
-rw-r--r--  fs/xfs/xfs_ag.h | 1
-rw-r--r--  fs/xfs/xfs_bmap.c | 2
-rw-r--r--  fs/xfs/xfs_buf_item.c | 55
-rw-r--r--  fs/xfs/xfs_buf_item.h | 2
-rw-r--r--  fs/xfs/xfs_dfrag.c | 22
-rw-r--r--  fs/xfs/xfs_error.c | 30
-rw-r--r--  fs/xfs/xfs_error.h | 9
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 18
-rw-r--r--  fs/xfs/xfs_inode.c | 2
-rw-r--r--  fs/xfs/xfs_inode_item.c | 21
-rw-r--r--  fs/xfs/xfs_iomap.c | 123
-rw-r--r--  fs/xfs/xfs_iomap.h | 47
-rw-r--r--  fs/xfs/xfs_log.c | 740
-rw-r--r--  fs/xfs/xfs_log.h | 13
-rw-r--r--  fs/xfs/xfs_log_priv.h | 12
-rw-r--r--  fs/xfs/xfs_log_recover.c | 311
-rw-r--r--  fs/xfs/xfs_mount.c | 7
-rw-r--r--  fs/xfs/xfs_mount.h | 1
-rw-r--r--  fs/xfs/xfs_quota.h | 3
-rw-r--r--  fs/xfs/xfs_trans.c | 760
-rw-r--r--  fs/xfs/xfs_trans.h | 14
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 187
625 files changed, 40541 insertions(+), 10026 deletions(-)
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index e777961939f3..0dbe0d139ac2 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -22,6 +22,7 @@
 
 #include <linux/jiffies.h>
 #include <linux/file.h>
+#include <linux/slab.h>
 #include <linux/stat.h>
 #include <linux/sched.h>
 #include <linux/fs.h>
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 08b2eb157048..7317b39b2815 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -24,6 +24,7 @@
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/idr.h>
 #include <net/9p/9p.h>
@@ -110,7 +111,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 {
 	int i, n, l, clone, any, access;
 	u32 uid;
-	struct p9_fid *fid;
+	struct p9_fid *fid, *old_fid = NULL;
 	struct dentry *d, *ds;
 	struct v9fs_session_info *v9ses;
 	char **wnames, *uname;
@@ -183,10 +184,18 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 		l = min(n - i, P9_MAXWELEM);
 		fid = p9_client_walk(fid, l, &wnames[i], clone);
 		if (IS_ERR(fid)) {
+			if (old_fid) {
+				/*
+				 * If we fail, clunk fid which are mapping
+				 * to path component and not the last component
+				 * of the path.
+				 */
+				p9_client_clunk(old_fid);
+			}
 			kfree(wnames);
 			return fid;
 		}
-
+		old_fid = fid;
 		i += l;
 		clone = 0;
 	}
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 6c7f6a251115..f8b86e92cd66 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -29,6 +29,7 @@
 #include <linux/sched.h>
 #include <linux/parser.h>
 #include <linux/idr.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 #include <net/9p/transport.h>
@@ -237,11 +238,18 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		return ERR_PTR(-ENOMEM);
 	}
 
+	rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY);
+	if (rc) {
+		__putname(v9ses->aname);
+		__putname(v9ses->uname);
+		return ERR_PTR(rc);
+	}
+
 	spin_lock(&v9fs_sessionlist_lock);
 	list_add(&v9ses->slist, &v9fs_sessionlist);
 	spin_unlock(&v9fs_sessionlist_lock);
 
-	v9ses->flags = V9FS_PROTO_2000U | V9FS_ACCESS_USER;
+	v9ses->flags = V9FS_ACCESS_USER;
 	strcpy(v9ses->uname, V9FS_DEFUSER);
 	strcpy(v9ses->aname, V9FS_DEFANAME);
 	v9ses->uid = ~0;
@@ -262,8 +270,10 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		goto error;
 	}
 
-	if (!p9_is_proto_dotu(v9ses->clnt))
-		v9ses->flags &= ~V9FS_PROTO_2000U;
+	if (p9_is_proto_dotl(v9ses->clnt))
+		v9ses->flags |= V9FS_PROTO_2000L;
+	else if (p9_is_proto_dotu(v9ses->clnt))
+		v9ses->flags |= V9FS_PROTO_2000U;
 
 	v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
 
@@ -298,6 +308,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	return fid;
 
 error:
+	bdi_destroy(&v9ses->bdi);
 	return ERR_PTR(retval);
 }
 
@@ -323,6 +334,8 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
 	__putname(v9ses->uname);
 	__putname(v9ses->aname);
 
+	bdi_destroy(&v9ses->bdi);
+
 	spin_lock(&v9fs_sessionlist_lock);
 	list_del(&v9ses->slist);
 	spin_unlock(&v9fs_sessionlist_lock);
@@ -340,6 +353,19 @@ void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
 	p9_client_disconnect(v9ses->clnt);
 }
 
+/**
+ * v9fs_session_begin_cancel - Begin terminate of a session
+ * @v9ses: session to terminate
+ *
+ * After this call we don't allow any request other than clunk.
+ */
+
+void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses)
+{
+	P9_DPRINTK(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses);
+	p9_client_begin_disconnect(v9ses->clnt);
+}
+
 extern int v9fs_error_init(void);
 
 static struct kobject *v9fs_kobj;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 79000bf62491..bec4d0bcb458 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -20,11 +20,12 @@
  * Boston, MA  02111-1301  USA
  *
  */
+#include <linux/backing-dev.h>
 
 /**
  * enum p9_session_flags - option flags for each 9P session
  * @V9FS_PROTO_2000U: whether or not to use 9P2000.u extensions
- * @V9FS_PROTO_2010L: whether or not to use 9P2010.l extensions
+ * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions
  * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy
  * @V9FS_ACCESS_USER: a new attach will be issued for every user (default)
  * @V9FS_ACCESS_ANY: use a single attach for all users
@@ -34,7 +35,7 @@
  */
 enum p9_session_flags {
 	V9FS_PROTO_2000U	= 0x01,
-	V9FS_PROTO_2010L	= 0x02,
+	V9FS_PROTO_2000L	= 0x02,
 	V9FS_ACCESS_SINGLE	= 0x04,
 	V9FS_ACCESS_USER	= 0x08,
 	V9FS_ACCESS_ANY		= 0x0C,
@@ -102,12 +103,14 @@ struct v9fs_session_info {
 	u32 uid;		/* if ACCESS_SINGLE, the uid that has access */
 	struct p9_client *clnt;	/* 9p client */
 	struct list_head slist;	/* list of sessions registered with v9fs */
+	struct backing_dev_info bdi;
 };
 
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
 							char *);
 void v9fs_session_close(struct v9fs_session_info *v9ses);
 void v9fs_session_cancel(struct v9fs_session_info *v9ses);
+void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
 
 #define V9FS_MAGIC 0x01021997
 
@@ -130,5 +133,5 @@ static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses)
 
 static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
 {
-	return v9ses->flags & V9FS_PROTO_2010L;
+	return v9ses->flags & V9FS_PROTO_2000L;
 }
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index d74325295b1e..cbf4e50f3933 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -34,6 +34,7 @@
 #include <linux/namei.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 6580aa449541..0adfd64dfcee 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -32,6 +32,7 @@
 #include <linux/sched.h>
 #include <linux/inet.h>
 #include <linux/idr.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -76,6 +77,15 @@ static inline int dt_type(struct p9_wstat *mistat)
 	return rettype;
 }
 
+static void p9stat_init(struct p9_wstat *stbuf)
+{
+	stbuf->name  = NULL;
+	stbuf->uid   = NULL;
+	stbuf->gid   = NULL;
+	stbuf->muid  = NULL;
+	stbuf->extension = NULL;
+}
+
 /**
  * v9fs_dir_readdir - read a directory
  * @filp: opened file structure
@@ -121,6 +131,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	rdir = (struct p9_rdir *) fid->rdir;
 
 	err = mutex_lock_interruptible(&rdir->mutex);
+	if (err)
+		return err;
 	while (err == 0) {
 		if (rdir->tail == rdir->head) {
 			err = v9fs_file_readn(filp, rdir->buf, NULL,
@@ -131,8 +143,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			rdir->head = 0;
 			rdir->tail = err;
 		}
-
 		while (rdir->head < rdir->tail) {
+			p9stat_init(&st);
 			err = p9stat_read(rdir->buf + rdir->head,
 					buflen - rdir->head, &st,
 					fid->clnt->proto_version);
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 36122683fae8..df52d488d2a6 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -114,7 +114,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
 	P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
 
 	/* No mandatory locks */
-	if (__mandatory_lock(inode))
+	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
 		return -ENOLCK;
 
 	if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
@@ -215,7 +215,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
 	struct p9_fid *fid;
 	struct p9_client *clnt;
 	struct inode *inode = filp->f_path.dentry->d_inode;
-	int origin = *offset;
+	loff_t origin = *offset;
 	unsigned long pg_start, pg_end;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 5fe45d692c9f..f2434fc9d2c4 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -34,6 +34,7 @@
 #include <linux/namei.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -431,6 +432,7 @@ error:
 
 static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
 {
+	int retval;
 	struct inode *file_inode;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *v9fid;
@@ -444,7 +446,10 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
 	if (IS_ERR(v9fid))
 		return PTR_ERR(v9fid);
 
-	return p9_client_remove(v9fid);
+	retval = p9_client_remove(v9fid);
+	if (!retval)
+		drop_nlink(file_inode);
+	return retval;
 }
 
 static int
@@ -656,6 +661,9 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	P9_DPRINTK(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n",
 		dir, dentry->d_name.name, dentry, nameidata);
 
+	if (dentry->d_name.len > NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+
 	sb = dir->i_sb;
 	v9ses = v9fs_inode2v9ses(dir);
 	dfid = v9fs_fid_lookup(dentry->d_parent);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 69357c0d9899..806da5d3b3a0 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -37,6 +37,7 @@
 #include <linux/mount.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -76,6 +77,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 	sb->s_blocksize = 1 << sb->s_blocksize_bits;
 	sb->s_magic = V9FS_MAGIC;
 	sb->s_op = &v9fs_super_ops;
+	sb->s_bdi = &v9ses->bdi;
 
 	sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
 	    MS_NOATIME;
@@ -193,6 +195,7 @@ static void v9fs_kill_super(struct super_block *s)
 
 	kill_anon_super(s);
 
+	v9fs_session_cancel(v9ses);
 	v9fs_session_close(v9ses);
 	kfree(v9ses);
 	s->s_fs_info = NULL;
@@ -205,7 +208,7 @@ v9fs_umount_begin(struct super_block *sb)
 	struct v9fs_session_info *v9ses;
 
 	v9ses = sb->s_fs_info;
-	v9fs_session_cancel(v9ses);
+	v9fs_session_begin_cancel(v9ses);
 }
 
 static const struct super_operations v9fs_super_ops = {
diff --git a/fs/Kconfig b/fs/Kconfig
index 7405f071be67..5f85b5947613 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -235,6 +235,7 @@ config NFS_COMMON
 
 source "net/sunrpc/Kconfig"
 source "fs/smbfs/Kconfig"
+source "fs/ceph/Kconfig"
 source "fs/cifs/Kconfig"
 source "fs/ncpfs/Kconfig"
 source "fs/coda/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index c3633aa46911..97f340f14ba2 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -125,3 +125,4 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
 obj-$(CONFIG_BTRFS_FS)		+= btrfs/
 obj-$(CONFIG_GFS2_FS)		+= gfs2/
 obj-$(CONFIG_EXOFS_FS)		+= exofs/
+obj-$(CONFIG_CEPH_FS)		+= ceph/
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 6910a98bd73c..4a3af7075c1d 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -13,6 +13,7 @@
 #include <linux/parser.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/statfs.h>
 #include "adfs.h"
diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c
index dc5ef14bdc1c..3e262711ae06 100644
--- a/fs/affs/bitmap.c
+++ b/fs/affs/bitmap.c
@@ -7,6 +7,7 @@
  * block allocation, deallocation, calculation of free space.
  */
 
+#include <linux/slab.h>
 #include "affs.h"
 
 /* This is, of course, shamelessly stolen from fs/minix */
@@ -128,7 +129,7 @@ err_range:
 /*
  * Allocate a block in the given allocation zone.
  * Since we have to byte-swap the bitmap on little-endian
- * machines, this is rather expensive. Therefor we will
+ * machines, this is rather expensive. Therefore we will
  * preallocate up to 16 blocks from the same word, if
  * possible. We are not doing preallocations in the
  * header zone, though.
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index c9744d771d98..f4b2a4ee4f91 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -10,6 +10,7 @@
  * (C) 1991  Linus Torvalds - minix filesystem
  */
 #include <linux/sched.h>
+#include <linux/gfp.h>
 #include "affs.h"
 
 extern const struct inode_operations affs_symlink_inode_operations;
diff --git a/fs/affs/super.c b/fs/affs/super.c
index d41e9673cd97..16a3e4765f68 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -17,6 +17,7 @@
 #include <linux/magic.h>
 #include <linux/sched.h>
 #include <linux/smp_lock.h>
+#include <linux/slab.h>
 #include "affs.h"
 
 extern struct timezone sys_tz;
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index e2b1d3f16519..0fb315dd4d2a 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -9,7 +9,6 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#include <linux/slab.h>
 #include <linux/sched.h>
 #include "internal.h"
 
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index eb765489164f..a3bcec75c54a 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -11,6 +11,7 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/ip.h>
 #include "internal.h"
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 88067f36e5e7..adc1cb771b57 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/ctype.h>
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 39b301662f22..0df9bc2b724d 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -12,10 +12,10 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
+#include <linux/gfp.h>
 #include "internal.h"
 
 static int afs_readpage(struct file *file, struct page *page);
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 023b95b0d9d7..4bd0218473a9 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -10,6 +10,7 @@
  */
 
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/circ_buf.h>
 #include "internal.h"
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index c048f0658751..d00b312e3110 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -16,7 +16,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/sched.h>
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index c54dad4e6063..a10f2582844f 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -19,6 +19,7 @@
 #include <linux/workqueue.h>
 #include <linux/sched.h>
 #include <linux/fscache.h>
+#include <linux/backing-dev.h>
 
 #include "afs.h"
 #include "afs_vl.h"
@@ -313,6 +314,7 @@ struct afs_volume {
 	unsigned short		rjservers;	/* number of servers discarded due to -ENOMEDIUM */
 	struct afs_server	*servers[8];	/* servers on which volume resides (ordered) */
 	struct rw_semaphore	server_sem;	/* lock for accessing current server */
+	struct backing_dev_info	bdi;
 };
 
 /*
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 5ffb570cd3a8..b3feddc4f7d6 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -12,11 +12,11 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
+#include <linux/gfp.h>
 #include "internal.h"
 
 
@@ -138,9 +138,9 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 {
 	struct afs_super_info *super;
 	struct vfsmount *mnt;
-	struct page *page = NULL;
+	struct page *page;
 	size_t size;
-	char *buf, *devname = NULL, *options = NULL;
+	char *buf, *devname, *options;
 	int ret;
 
 	_enter("{%s}", mntpt->d_name.name);
@@ -150,22 +150,22 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 	ret = -EINVAL;
 	size = mntpt->d_inode->i_size;
 	if (size > PAGE_SIZE - 1)
-		goto error;
+		goto error_no_devname;
 
 	ret = -ENOMEM;
 	devname = (char *) get_zeroed_page(GFP_KERNEL);
 	if (!devname)
-		goto error;
+		goto error_no_devname;
 
 	options = (char *) get_zeroed_page(GFP_KERNEL);
 	if (!options)
-		goto error;
+		goto error_no_options;
 
 	/* read the contents of the AFS special symlink */
 	page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL);
 	if (IS_ERR(page)) {
 		ret = PTR_ERR(page);
-		goto error;
+		goto error_no_page;
 	}
 
 	ret = -EIO;
@@ -196,12 +196,12 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 	return mnt;
 
 error:
-	if (page)
-		page_cache_release(page);
-	if (devname)
-		free_page((unsigned long) devname);
-	if (options)
-		free_page((unsigned long) options);
+	page_cache_release(page);
+error_no_page:
+	free_page((unsigned long) options);
+error_no_options:
+	free_page((unsigned long) devname);
+error_no_devname:
 	_leave(" = %d", ret);
 	return ERR_PTR(ret);
 }
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index bde3f19c0995..67cf810e0fd6 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -9,6 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/slab.h>
 #include <net/sock.h>
 #include <net/af_rxrpc.h>
 #include <rxrpc/packet.h>
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 3ef504370034..bb4ed144d0e4 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -189,8 +189,9 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order)
 	if (!permits)
 		goto out_unlock;
 
-	memcpy(permits->permits, xpermits->permits,
-	       count * sizeof(struct afs_permit));
+	if (xpermits)
+		memcpy(permits->permits, xpermits->permits,
+		       count * sizeof(struct afs_permit));
 
 	_debug("key %x access %x",
 	       key_serial(key), vnode->status.caller_access);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 14f6431598ad..e932e5a3a0c1 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -311,6 +311,7 @@ static int afs_fill_super(struct super_block *sb, void *data)
 	sb->s_magic		= AFS_FS_MAGIC;
 	sb->s_op		= &afs_super_ops;
 	sb->s_fs_info		= as;
+	sb->s_bdi		= &as->volume->bdi;
 
 	/* allocate the root inode and dentry */
 	fid.vid		= as->volume->vid;
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index 36c1306e09e0..340afd0cd182 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -9,6 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/gfp.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include "internal.h"
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 6e689208def2..9ac260d1361d 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -11,6 +11,7 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include "internal.h"
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index 2f05c4fc2a70..25cf4c3f4ff7 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include "internal.h"
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index a353e69e2391..401eeb21869f 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -106,6 +106,10 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
 	volume->cell		= params->cell;
 	volume->vid		= vlocation->vldb.vid[params->type];
 
+	ret = bdi_setup_and_register(&volume->bdi, "afs", BDI_CAP_MAP_COPY);
+	if (ret)
+		goto error_bdi;
+
 	init_rwsem(&volume->server_sem);
 
 	/* look up all the applicable server records */
@@ -151,6 +155,8 @@ error:
 	return ERR_PTR(ret);
 
 error_discard:
+	bdi_destroy(&volume->bdi);
+error_bdi:
 	up_write(&params->cell->vl_sem);
 
 	for (loop = volume->nservers - 1; loop >= 0; loop--)
@@ -200,6 +206,7 @@ void afs_put_volume(struct afs_volume *volume)
 	for (loop = volume->nservers - 1; loop >= 0; loop--)
 		afs_put_server(volume->servers[loop]);
 
+	bdi_destroy(&volume->bdi);
 	kfree(volume);
 
 	_leave(" [destroyed]");
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 9f0bf13291e5..e4b75d6eda83 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -12,7 +12,6 @@
 #include <linux/file.h>
 #include <linux/poll.h>
 #include <linux/sched.h>
-#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/mount.h>
@@ -209,6 +208,7 @@ static struct inode *anon_inode_mkinode(void)
 	inode->i_mode = S_IRUSR | S_IWUSR;
 	inode->i_uid = current_fsuid();
 	inode->i_gid = current_fsgid();
+	inode->i_flags |= S_PRIVATE;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	return inode;
 }
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 4a1401cea0a1..8713c7cfbc79 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -13,6 +13,7 @@
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/param.h>
 #include <linux/time.h>
 #include <linux/smp_lock.h>
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index c8a80dffb455..d29b7f6df862 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -22,6 +22,7 @@
 #include <linux/magic.h>
 #include <linux/dcache.h>
 #include <linux/uaccess.h>
+#include <linux/slab.h>
 
 #include "autofs_i.h"
 
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index a015b49891df..e8e5e63ac950 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -15,6 +15,7 @@
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/param.h>
 #include <linux/time.h>
 #include "autofs_i.h"
@@ -176,8 +177,7 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags)
 		}
 		/* Trigger mount for path component or follow link */
 	} else if (ino->flags & AUTOFS_INF_PENDING ||
-			autofs4_need_mount(flags) ||
-			current->link_count) {
+			autofs4_need_mount(flags)) {
 		DPRINTK("waiting for mount name=%.*s",
 			dentry->d_name.len, dentry->d_name.name);
 
@@ -261,7 +261,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 	spin_unlock(&dcache_lock);
 	spin_unlock(&sbi->fs_lock);
 
-	status = try_to_fill_dentry(dentry, 0);
+	status = try_to_fill_dentry(dentry, nd->flags);
 	if (status)
 		goto out_error;
 
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index e3287d0d1a58..59096b5e0fc7 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -11,7 +11,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/buffer_head.h>
 #include <linux/string.h>
 
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 15d80bb35d6f..f96eff04e11a 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -20,11 +20,11 @@
 #include <linux/fcntl.h>
 #include <linux/ptrace.h>
 #include <linux/user.h>
-#include <linux/slab.h>
 #include <linux/binfmts.h>
 #include <linux/personality.h>
 #include <linux/init.h>
 #include <linux/coredump.h>
+#include <linux/slab.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -75,14 +75,16 @@ static int aout_core_dump(struct coredump_params *cprm)
 	struct file *file = cprm->file;
 	mm_segment_t fs;
 	int has_dumped = 0;
-	unsigned long dump_start, dump_size;
+	void __user *dump_start;
+	int dump_size;
 	struct user dump;
 #ifdef __alpha__
-#       define START_DATA(u)	(u.start_data)
+#       define START_DATA(u)	((void __user *)u.start_data)
 #else
-#	define START_DATA(u)	((u.u_tsize << PAGE_SHIFT) + u.start_code)
+#	define START_DATA(u)	((void __user *)((u.u_tsize << PAGE_SHIFT) + \
+				 u.start_code))
 #endif
-# define START_STACK(u)   (u.start_stack)
+# define START_STACK(u)   ((void __user *)u.start_stack)
 
 	fs = get_fs();
 	set_fs(KERNEL_DS);
@@ -104,9 +106,9 @@ static int aout_core_dump(struct coredump_params *cprm)
 
 /* make sure we actually have a data and stack area to dump */
 	set_fs(USER_DS);
-	if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
+	if (!access_ok(VERIFY_READ, START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
 		dump.u_dsize = 0;
-	if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
+	if (!access_ok(VERIFY_READ, START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
 		dump.u_ssize = 0;
 
 	set_fs(KERNEL_DS);
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 6d6a16c5e9bb..2c5f9a0e5d72 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1005,15 +1005,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
 			}
 		} else if (!mm->start_data) {
 			mm->start_data = seg->addr;
-#ifndef CONFIG_MMU
 			mm->end_data = seg->addr + phdr->p_memsz;
-#endif
 		}
-
-#ifdef CONFIG_MMU
-		if (seg->addr + phdr->p_memsz > mm->end_data)
-			mm->end_data = seg->addr + phdr->p_memsz;
-#endif
 	}
 
 	seg++;
@@ -1374,7 +1367,7 @@ static inline void fill_note(struct memelfnote *note, const char *name, int type
 
 /*
  * fill up all the fields in prstatus from the given task struct, except
- * registers which need to be filled up seperately.
+ * registers which need to be filled up separately.
  */
 static void fill_prstatus(struct elf_prstatus *prstatus,
 			  struct task_struct *p, long signr)
@@ -1590,7 +1583,7 @@ static size_t elf_core_vma_data_size(unsigned long mm_flags)
 	struct vm_area_struct *vma;
 	size_t size = 0;
 
-	for (vma = current->mm->mmap; vma; vma->vm_next)
+	for (vma = current->mm->mmap; vma; vma = vma->vm_next)
 		if (maydump(vma, mm_flags))
 			size += vma->vm_end - vma->vm_start;
 	return size;
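
Note: the elf_core_vma_data_size() change is a genuine bug fix. The old third for-clause read "vma->vm_next", which evaluates the pointer but never assigns it, so the walk never advanced. A runnable model of the corrected iteration, with toy types standing in for the kernel's:

	#include <stdio.h>
	#include <stddef.h>

	struct vma {
		size_t vm_start, vm_end;
		struct vma *vm_next;
	};

	static size_t vma_data_size(const struct vma *mmap)
	{
		const struct vma *vma;
		size_t size = 0;

		/* fixed form: the third clause assigns, so the cursor moves */
		for (vma = mmap; vma; vma = vma->vm_next)
			size += vma->vm_end - vma->vm_start;
		return size;
	}

	int main(void)
	{
		struct vma c = { 0x3000, 0x4000, NULL };
		struct vma b = { 0x1000, 0x3000, &c };
		struct vma a = { 0x0000, 0x1000, &b };

		printf("%zu\n", vma_data_size(&a));	/* prints 16384 */
		return 0;
	}
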
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index 32fb00b52cd0..b8e8b0acf9bd 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -11,7 +11,6 @@
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/stat.h>
-#include <linux/slab.h>
 #include <linux/binfmts.h>
 #include <linux/elf.h>
 #include <linux/init.h>
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index e0e769bdca59..49566c1687d8 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -355,7 +355,7 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
 
 	if (!flat_reloc_valid(r, start_brk - start_data + text_len)) {
 		printk("BINFMT_FLAT: reloc outside program 0x%x (0 - 0x%x/0x%x)",
-		       (int) r,(int)(start_brk-start_code),(int)text_len);
+		       (int) r,(int)(start_brk-start_data+text_len),(int)text_len);
 		goto failed;
 	}
 
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 08343505e184..aca9d55afb22 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -8,7 +8,6 @@
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/stat.h>
-#include <linux/slab.h>
 #include <linux/binfmts.h>
 #include <linux/init.h>
 #include <linux/file.h>
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index a16f29e888cd..612a5c38d3c1 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -24,6 +24,7 @@
 #include <linux/mempool.h>
 #include <linux/bio.h>
 #include <linux/workqueue.h>
+#include <linux/slab.h>
 
 struct integrity_slab {
 	struct kmem_cache *slab;
diff --git a/fs/bio.c b/fs/bio.c
index dc17afd672e3..e7bf6ca64dcf 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -264,13 +264,12 @@ EXPORT_SYMBOL(bio_init);
  * bio_alloc_bioset - allocate a bio for I/O
  * @gfp_mask: the GFP_ mask given to the slab allocator
  * @nr_iovecs: number of iovecs to pre-allocate
- * @bs:	the bio_set to allocate from. If %NULL, just use kmalloc
+ * @bs:	the bio_set to allocate from.
  *
  * Description:
- *   bio_alloc_bioset will first try its own mempool to satisfy the allocation.
+ *   bio_alloc_bioset will try its own mempool to satisfy the allocation.
  *   If %__GFP_WAIT is set then we will block on the internal pool waiting
- *   for a &struct bio to become free. If a %NULL @bs is passed in, we will
- *   fall back to just using @kmalloc to allocate the required memory.
+ *   for a &struct bio to become free.
  *
  *   Note that the caller must set ->bi_destructor on successful return
  *   of a bio, to do the appropriate freeing of the bio once the reference
@@ -555,7 +554,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 		.bi_rw = bio->bi_rw,
 	};
 
-	if (q->merge_bvec_fn(q, &bvm, prev) < len) {
+	if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) {
 		prev->bv_len -= len;
 		return 0;
 	}
@@ -608,7 +607,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 		 * merge_bvec_fn() returns number of bytes it can accept
 		 * at this offset
 		 */
-		if (q->merge_bvec_fn(q, &bvm, bvec) < len) {
+		if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) {
 			bvec->bv_page = NULL;
 			bvec->bv_len = 0;
 			bvec->bv_offset = 0;
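
Note: both __bio_add_page() hunks tighten the contract with the driver's merge_bvec_fn(): once the tail biovec has been speculatively grown, the callback must accept the whole resulting segment, so its return value is now compared with the bvec's full bv_len rather than only the bytes being added. A compilable sketch of the corrected check, with simplified types and the queue plumbing omitted:

	struct bio_vec_lite {
		unsigned int bv_len;
	};

	/* 'accepted' is what merge_bvec_fn() said it can take at this offset;
	 * 'len' is how many bytes were speculatively added to prev->bv_len. */
	static int keep_merged_segment(unsigned int accepted,
				       struct bio_vec_lite *prev,
				       unsigned int len)
	{
		if (accepted < prev->bv_len) {	/* was: accepted < len */
			prev->bv_len -= len;	/* roll back the grow */
			return 0;
		}
		return 1;
	}

Comparing against len alone could keep a merge that the driver had only partially accepted.
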
diff --git a/fs/block_dev.c b/fs/block_dev.c
index d11d0289f3d2..6dcee88c2e5d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -404,20 +404,28 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
  * NULL first argument is nfsd_sync_dir() and that's not a directory.
  */
 
-static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
+int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync)
 {
-	struct block_device *bdev = I_BDEV(filp->f_mapping->host);
+	struct inode *bd_inode = filp->f_mapping->host;
+	struct block_device *bdev = I_BDEV(bd_inode);
 	int error;
 
-	error = sync_blockdev(bdev);
-	if (error)
-		return error;
-
+	/*
+	 * There is no need to serialise calls to blkdev_issue_flush with
+	 * i_mutex and doing so causes performance issues with concurrent
+	 * O_SYNC writers to a block device.
+	 */
+	mutex_unlock(&bd_inode->i_mutex);
+
 	error = blkdev_issue_flush(bdev, NULL);
 	if (error == -EOPNOTSUPP)
 		error = 0;
+
+	mutex_lock(&bd_inode->i_mutex);
+
 	return error;
 }
+EXPORT_SYMBOL(blkdev_fsync);
 
 /*
  * pseudo-fs
@@ -1481,7 +1489,7 @@ const struct file_operations def_blk_fops = {
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= blkdev_aio_write,
 	.mmap		= generic_file_mmap,
-	.fsync		= block_fsync,
+	.fsync		= blkdev_fsync,
 	.unlocked_ioctl	= block_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= compat_blkdev_ioctl,
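
Note: blkdev_fsync() is entered with i_mutex held by the VFS, and the comment added in the hunk records why it is safe to drop it around the potentially slow flush. The shape of that gap, as a user-space pthread stand-in (illustrative only, not the kernel locking API):

	#include <pthread.h>

	static pthread_mutex_t i_mutex = PTHREAD_MUTEX_INITIALIZER;

	static int issue_flush(void)
	{
		return 0;	/* stands in for blkdev_issue_flush() */
	}

	/* called with i_mutex held; must return with it held */
	static int fsync_like(void)
	{
		int error;

		pthread_mutex_unlock(&i_mutex);	/* concurrent O_SYNC writers proceed */
		error = issue_flush();
		pthread_mutex_lock(&i_mutex);	/* restore the caller's invariant */
		return error;
	}

The unlock/lock pair brackets only the flush, so the function still returns in exactly the locking state its caller expects.
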
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 6df6d6ed74fd..6ef7b26724ec 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -22,6 +22,7 @@
 #include <linux/posix_acl_xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 
 #include "ctree.h"
 #include "btrfs_inode.h"
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c0861e781cdb..462859a30141 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/kthread.h>
+#include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/freezer.h>
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 3f1f50d9d916..7a4dee199832 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -153,6 +153,11 @@ struct btrfs_inode {
 	unsigned ordered_data_close:1;
 	unsigned dummy_inode:1;
 
+	/*
+	 * always compress this one file
+	 */
+	unsigned force_compress:1;
+
 	struct inode vfs_inode;
 };
 
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index a11a32058b50..396039b3a8a2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -31,7 +31,7 @@
 #include <linux/swap.h>
 #include <linux/writeback.h>
 #include <linux/bit_spinlock.h>
-#include <linux/pagevec.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -445,7 +445,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 	unsigned long nr_pages = 0;
 	struct extent_map *em;
 	struct address_space *mapping = inode->i_mapping;
-	struct pagevec pvec;
 	struct extent_map_tree *em_tree;
 	struct extent_io_tree *tree;
 	u64 end;
@@ -461,7 +460,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 
 	end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
 
-	pagevec_init(&pvec, 0);
 	while (last_offset < compressed_end) {
 		page_index = last_offset >> PAGE_CACHE_SHIFT;
 
@@ -478,26 +476,17 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 			goto next;
 		}
 
-		page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS);
+		page = __page_cache_alloc(mapping_gfp_mask(mapping) &
+								~__GFP_FS);
 		if (!page)
 			break;
 
-		page->index = page_index;
-		/*
-		 * what we want to do here is call add_to_page_cache_lru,
-		 * but that isn't exported, so we reproduce it here
-		 */
-		if (add_to_page_cache(page, mapping,
-				      page->index, GFP_NOFS)) {
+		if (add_to_page_cache_lru(page, mapping, page_index,
+					  GFP_NOFS)) {
 			page_cache_release(page);
 			goto next;
 		}
 
-		/* open coding of lru_cache_add, also not exported */
-		page_cache_get(page);
-		if (!pagevec_add(&pvec, page))
-			__pagevec_lru_add_file(&pvec);
-
 		end = last_offset + PAGE_CACHE_SIZE - 1;
 		/*
 		 * at this point, we have a locked page in the page cache
@@ -551,8 +540,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 next:
 		last_offset += PAGE_CACHE_SIZE;
 	}
-	if (pagevec_count(&pvec))
-		__pagevec_lru_add_file(&pvec);
 	return 0;
 }
 
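Note: both blocks deleted from this file existed only because add_to_page_cache_lru() was not exported to modules; once it is, the readahead insertion collapses to the calls the diff keeps. The resulting shape, reproduced from the hunk above for reading (kernel context, not stand-alone code):

	page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
	if (!page)
		break;

	/* one exported helper now does the cache insert plus the LRU add */
	if (add_to_page_cache_lru(page, mapping, page_index, GFP_NOFS)) {
		page_cache_release(page);	/* already cached or racing; skip */
		goto next;
	}
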
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c4bc570a396e..6795a713b205 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -3040,6 +3041,10 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
 	if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
 		goto err;
 
+	/* the leaf has changed, it now has room.  return now */
+	if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
+		goto err;
+
 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
 		fi = btrfs_item_ptr(leaf, path->slots[0],
 				    struct btrfs_file_extent_item);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8b5cfdd4bfc1..746a7248678e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -26,6 +26,7 @@
 #include <linux/completion.h>
 #include <linux/backing-dev.h>
 #include <linux/wait.h>
+#include <linux/slab.h>
 #include <asm/kmap_types.h>
 #include "extent_io.h"
 #include "extent_map.h"
@@ -373,11 +374,13 @@ struct btrfs_super_block {
  * ones specified below then we will fail to mount
  */
 #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
+#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(2ULL << 0)
 
 #define BTRFS_FEATURE_COMPAT_SUPP		0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
 #define BTRFS_FEATURE_INCOMPAT_SUPP		\
-	BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF
+	(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |	\
+	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)
 
 /*
  * A leaf is full of items. offset and size tell us where to find
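
Note on the new flag's value: 2ULL << 0 is simply 2, the same bit as 1ULL << 1, i.e. the next free incompat bit after MIXED_BACKREF. A runnable sketch of how a support mask like this gates mounting (a simplified stand-in for the real superblock check):

	#include <stdio.h>

	#define INCOMPAT_MIXED_BACKREF	(1ULL << 0)
	#define INCOMPAT_DEFAULT_SUBVOL	(2ULL << 0)	/* == 1ULL << 1 */
	#define INCOMPAT_SUPP \
		(INCOMPAT_MIXED_BACKREF | INCOMPAT_DEFAULT_SUBVOL)

	int main(void)
	{
		/* pretend the on-disk superblock carries an unknown bit 5 */
		unsigned long long ondisk = INCOMPAT_MIXED_BACKREF | (1ULL << 5);

		if (ondisk & ~INCOMPAT_SUPP)
			puts("unknown incompat feature, refusing to mount");
		else
			puts("mount allowed");
		return 0;
	}
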
@@ -832,7 +835,6 @@ struct btrfs_fs_info {
 	u64 last_trans_log_full_commit;
 	u64 open_ioctl_trans;
 	unsigned long mount_opt;
-	u64 max_extent;
 	u64 max_inline;
 	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
@@ -1182,7 +1184,6 @@ struct btrfs_root {
 #define BTRFS_INODE_NOATIME		(1 << 9)
 #define BTRFS_INODE_DIRSYNC		(1 << 10)
 
-
 /* some macros to generate set/get funcs for the struct fields.  This
  * assumes there is a lefoo_to_cpu for every type, so lets make a simple
  * one for u8:
@@ -1842,7 +1843,7 @@ BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
 BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block,
 			 compat_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
-			 compat_flags, 64);
+			 compat_ro_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
 			 incompat_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
@@ -2310,7 +2311,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       u32 min_type);
 
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+			      struct extent_state **cached_state);
 int btrfs_writepages(struct address_space *mapping,
 		     struct writeback_control *wbc);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -2335,7 +2337,7 @@ int btrfs_init_cachep(void);
 void btrfs_destroy_cachep(void);
 long btrfs_ioctl_trans_end(struct file *file);
 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
-			 struct btrfs_root *root);
+			 struct btrfs_root *root, int *was_new);
 int btrfs_commit_write(struct file *file, struct page *page,
 		       unsigned from, unsigned to);
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
@@ -2386,7 +2388,6 @@ void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
 ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
 
 /* super.c */
-u64 btrfs_parse_size(char *str);
 int btrfs_parse_options(struct btrfs_root *root, char *options);
 int btrfs_sync_fs(struct super_block *sb, int wait);
 
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 84e6781413b1..902ce507c4e3 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/sort.h>
 #include "ctree.h"
 #include "delayed-ref.h"
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2b59201b955c..feca04197d02 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -27,6 +27,7 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/crc32c.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -43,8 +44,6 @@ static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
 
-static atomic_t btrfs_bdi_num = ATOMIC_INIT(0);
-
 /*
  * end_io_wq structs are used to do processing in task context when an IO is
  * complete. This is used during reads to verify checksums, and it is used
@@ -263,13 +262,15 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 static int verify_parent_transid(struct extent_io_tree *io_tree,
 				 struct extent_buffer *eb, u64 parent_transid)
 {
+	struct extent_state *cached_state = NULL;
 	int ret;
 
 	if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
 		return 0;
 
-	lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS);
-	if (extent_buffer_uptodate(io_tree, eb) &&
+	lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
+			 0, &cached_state, GFP_NOFS);
+	if (extent_buffer_uptodate(io_tree, eb, cached_state) &&
 	    btrfs_header_generation(eb) == parent_transid) {
 		ret = 0;
 		goto out;
@@ -282,10 +283,10 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
 		       (unsigned long long)btrfs_header_generation(eb));
 	}
 	ret = 1;
-	clear_extent_buffer_uptodate(io_tree, eb);
+	clear_extent_buffer_uptodate(io_tree, eb, &cached_state);
 out:
-	unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
-		      GFP_NOFS);
+	unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
+			     &cached_state, GFP_NOFS);
 	return ret;
 }
291 292
@@ -901,7 +902,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
901 root->highest_objectid = 0; 902 root->highest_objectid = 0;
902 root->name = NULL; 903 root->name = NULL;
903 root->in_sysfs = 0; 904 root->in_sysfs = 0;
904 root->inode_tree.rb_node = NULL; 905 root->inode_tree = RB_ROOT;
905 906
906 INIT_LIST_HEAD(&root->dirty_list); 907 INIT_LIST_HEAD(&root->dirty_list);
907 INIT_LIST_HEAD(&root->orphan_list); 908 INIT_LIST_HEAD(&root->orphan_list);
@@ -1372,19 +1373,11 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 {
 	int err;
 
-	bdi->name = "btrfs";
 	bdi->capabilities = BDI_CAP_MAP_COPY;
-	err = bdi_init(bdi);
+	err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY);
 	if (err)
 		return err;
 
-	err = bdi_register(bdi, NULL, "btrfs-%d",
-			   atomic_inc_return(&btrfs_bdi_num));
-	if (err) {
-		bdi_destroy(bdi);
-		return err;
-	}
-
 	bdi->ra_pages	= default_backing_dev_info.ra_pages;
 	bdi->unplug_io_fn	= btrfs_unplug_io_fn;
 	bdi->unplug_io_data	= info;
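
Note: this hunk and the btrfs_bdi_num removal earlier in the file are one cleanup. bdi_setup_and_register(), introduced around this kernel release, wraps bdi_init() and bdi_register() and generates the unique "btrfs-N" name itself, leaving a single error path. The resulting function, assembled from the diff with comments added:

	static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
	{
		int err;

		bdi->capabilities = BDI_CAP_MAP_COPY;
		/* init + register + unique naming in one call; the helper
		 * tears itself down on failure, so no bdi_destroy() here */
		err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY);
		if (err)
			return err;

		bdi->ra_pages = default_backing_dev_info.ra_pages;
		bdi->unplug_io_fn = btrfs_unplug_io_fn;
		bdi->unplug_io_data = info;
		return 0;
	}
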
@@ -1632,7 +1625,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	atomic_set(&fs_info->async_submit_draining, 0);
 	atomic_set(&fs_info->nr_async_bios, 0);
 	fs_info->sb = sb;
-	fs_info->max_extent = (u64)-1;
 	fs_info->max_inline = 8192 * 1024;
 	fs_info->metadata_ratio = 0;
 
@@ -1673,7 +1665,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	insert_inode_hash(fs_info->btree_inode);
 
 	spin_lock_init(&fs_info->block_group_cache_lock);
-	fs_info->block_group_cache_tree.rb_node = NULL;
+	fs_info->block_group_cache_tree = RB_ROOT;
 
 	extent_io_tree_init(&fs_info->freed_extents[0],
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -1920,7 +1912,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	csum_root->track_dirty = 1;
 
-	btrfs_read_block_groups(extent_root);
+	ret = btrfs_read_block_groups(extent_root);
+	if (ret) {
+		printk(KERN_ERR "Failed to read block groups: %d\n", ret);
+		goto fail_block_groups;
+	}
 
 	fs_info->generation = generation;
 	fs_info->last_trans_committed = generation;
@@ -1930,7 +1926,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
 					       "btrfs-cleaner");
 	if (IS_ERR(fs_info->cleaner_kthread))
-		goto fail_csum_root;
+		goto fail_block_groups;
 
 	fs_info->transaction_kthread = kthread_run(transaction_kthread,
 						   tree_root,
@@ -2018,7 +2014,8 @@ fail_cleaner:
 	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 
-fail_csum_root:
+fail_block_groups:
+	btrfs_free_block_groups(fs_info);
 	free_extent_buffer(csum_root->node);
 	free_extent_buffer(csum_root->commit_root);
 fail_dev_root:
@@ -2497,7 +2494,8 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
 	int ret;
 	struct inode *btree_inode = buf->first_page->mapping->host;
 
-	ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf);
+	ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf,
+				     NULL);
 	if (!ret)
 		return ret;
 
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index ba5c3fd5ab8c..951ef09b82f4 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -95,7 +95,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
 
-	inode = btrfs_iget(sb, &key, root);
+	inode = btrfs_iget(sb, &key, root, NULL);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto fail;
@@ -223,7 +223,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
 
 	key.type = BTRFS_INODE_ITEM_KEY;
 	key.offset = 0;
-	dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
+	dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
 	if (!IS_ERR(dentry))
 		dentry->d_op = &btrfs_dentry_operations;
 	return dentry;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 559f72489b3b..b34d32fdaaec 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -22,6 +22,7 @@
 #include <linux/sort.h>
 #include <linux/rcupdate.h>
 #include <linux/kthread.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "hash.h"
 #include "ctree.h"
@@ -2676,6 +2677,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 
 	INIT_LIST_HEAD(&found->block_groups);
 	init_rwsem(&found->groups_sem);
+	init_waitqueue_head(&found->flush_wait);
+	init_waitqueue_head(&found->allocate_wait);
 	spin_lock_init(&found->lock);
 	found->flags = flags;
 	found->total_bytes = total_bytes;
@@ -2846,7 +2849,7 @@ int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
 	}
 	spin_unlock(&BTRFS_I(inode)->accounting_lock);
 
-	BTRFS_I(inode)->reserved_extents--;
+	BTRFS_I(inode)->reserved_extents -= num_items;
 	BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
 
 	if (meta_sinfo->bytes_delalloc < num_bytes) {
@@ -2944,12 +2947,10 @@ static void flush_delalloc(struct btrfs_root *root,
 
 	spin_lock(&info->lock);
 
-	if (!info->flushing) {
+	if (!info->flushing)
 		info->flushing = 1;
-		init_waitqueue_head(&info->flush_wait);
-	} else {
+	else
 		wait = true;
-	}
 
 	spin_unlock(&info->lock);
 
@@ -3011,7 +3012,6 @@ static int maybe_allocate_chunk(struct btrfs_root *root,
 	if (!info->allocating_chunk) {
 		info->force_alloc = 1;
 		info->allocating_chunk = 1;
-		init_waitqueue_head(&info->allocate_wait);
 	} else {
 		wait = true;
 	}
@@ -3111,7 +3111,7 @@ again:
 		return -ENOSPC;
 	}
 
-	BTRFS_I(inode)->reserved_extents++;
+	BTRFS_I(inode)->reserved_extents += num_items;
 	check_force_delalloc(meta_sinfo);
 	spin_unlock(&meta_sinfo->lock);
 
@@ -3235,7 +3235,8 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
 			       u64 bytes)
 {
 	struct btrfs_space_info *data_sinfo;
-	int ret = 0, committed = 0;
+	u64 used;
+	int ret = 0, committed = 0, flushed = 0;
 
 	/* make sure bytes are sectorsize aligned */
 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
@@ -3247,12 +3248,21 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
 again:
 	/* make sure we have enough space to handle the data first */
 	spin_lock(&data_sinfo->lock);
-	if (data_sinfo->total_bytes - data_sinfo->bytes_used -
-	    data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
-	    data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
-	    data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) {
+	used = data_sinfo->bytes_used + data_sinfo->bytes_delalloc +
+		data_sinfo->bytes_reserved + data_sinfo->bytes_pinned +
+		data_sinfo->bytes_readonly + data_sinfo->bytes_may_use +
+		data_sinfo->bytes_super;
+
+	if (used + bytes > data_sinfo->total_bytes) {
 		struct btrfs_trans_handle *trans;
 
+		if (!flushed) {
+			spin_unlock(&data_sinfo->lock);
+			flush_delalloc(root, data_sinfo);
+			flushed = 1;
+			goto again;
+		}
+
 		/*
 		 * if we don't have enough free bytes in this space then we need
 		 * to alloc a new chunk.
@@ -4170,6 +4180,10 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	ins->offset = 0;
 
 	space_info = __find_space_info(root->fs_info, data);
+	if (!space_info) {
+		printk(KERN_ERR "No space info for %d\n", data);
+		return -ENOSPC;
+	}
 
 	if (orig_root->ref_cows || empty_size)
 		allowed_chunk_alloc = 1;
@@ -5205,6 +5219,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 	next = btrfs_find_tree_block(root, bytenr, blocksize);
 	if (!next) {
 		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+		if (!next)
+			return -ENOMEM;
 		reada = 1;
 	}
 	btrfs_tree_lock(next);
@@ -5417,7 +5433,8 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 		if (ret > 0) {
 			path->slots[level]++;
 			continue;
-		}
+		} else if (ret < 0)
+			return ret;
 		level = wc->level;
 	}
 	return 0;
@@ -6561,6 +6578,7 @@ static noinline int invalidate_extent_cache(struct btrfs_root *root,
 	struct btrfs_key key;
 	struct inode *inode = NULL;
 	struct btrfs_file_extent_item *fi;
+	struct extent_state *cached_state = NULL;
 	u64 num_bytes;
 	u64 skip_objectid = 0;
 	u32 nritems;
@@ -6589,12 +6607,14 @@ static noinline int invalidate_extent_cache(struct btrfs_root *root,
 		}
 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
 
-		lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
-			    key.offset + num_bytes - 1, GFP_NOFS);
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, key.offset,
+				 key.offset + num_bytes - 1, 0, &cached_state,
+				 GFP_NOFS);
 		btrfs_drop_extent_cache(inode, key.offset,
 					key.offset + num_bytes - 1, 1);
-		unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
-			      key.offset + num_bytes - 1, GFP_NOFS);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, key.offset,
+				     key.offset + num_bytes - 1, &cached_state,
+				     GFP_NOFS);
 		cond_resched();
 	}
 	iput(inode);
@@ -7366,7 +7386,6 @@ static int find_first_block_group(struct btrfs_root *root,
 		}
 		path->slots[0]++;
 	}
-	ret = -ENOENT;
 out:
 	return ret;
 }
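
Note: the btrfs_check_data_free_space() hunk above does two things: it folds the hard-to-read subtraction chain into an explicit "used" sum, and it retries exactly once after flushing delalloc before falling back to chunk allocation or a commit. A stand-alone model of that control flow (toy accounting with made-up numbers):

	#include <stdio.h>

	struct space_info {
		unsigned long long total_bytes;
		unsigned long long used;	/* sum of all consumers */
	};

	static void flush_delalloc(struct space_info *s)
	{
		s->used -= 100;	/* toy: pretend a flush frees 100 bytes */
	}

	static int check_data_free_space(struct space_info *s,
					 unsigned long long bytes)
	{
		int flushed = 0;
	again:
		if (s->used + bytes > s->total_bytes) {
			if (!flushed) {
				flush_delalloc(s);
				flushed = 1;
				goto again;	/* re-sample after the flush */
			}
			return -1;	/* would try chunk alloc, then ENOSPC */
		}
		s->used += bytes;
		return 0;
	}

	int main(void)
	{
		struct space_info s = { 1000, 950 };

		printf("%d\n", check_data_free_space(&s, 100)); /* 0: one flush */
		return 0;
	}
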
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b177ed319612..d2d03684fab2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2,7 +2,6 @@
 #include <linux/slab.h>
 #include <linux/bio.h>
 #include <linux/mm.h>
-#include <linux/gfp.h>
 #include <linux/pagemap.h>
 #include <linux/page-flags.h>
 #include <linux/module.h>
@@ -104,8 +103,8 @@ void extent_io_exit(void)
 void extent_io_tree_init(struct extent_io_tree *tree,
 			  struct address_space *mapping, gfp_t mask)
 {
-	tree->state.rb_node = NULL;
-	tree->buffer.rb_node = NULL;
+	tree->state = RB_ROOT;
+	tree->buffer = RB_ROOT;
 	tree->ops = NULL;
 	tree->dirty_bytes = 0;
 	spin_lock_init(&tree->lock);
@@ -513,7 +512,10 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	u64 last_end;
 	int err;
 	int set = 0;
+	int clear = 0;
 
+	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
+		clear = 1;
 again:
 	if (!prealloc && (mask & __GFP_WAIT)) {
 		prealloc = alloc_extent_state(mask);
@@ -524,14 +526,20 @@ again:
 	spin_lock(&tree->lock);
 	if (cached_state) {
 		cached = *cached_state;
-		*cached_state = NULL;
-		cached_state = NULL;
+
+		if (clear) {
+			*cached_state = NULL;
+			cached_state = NULL;
+		}
+
 		if (cached && cached->tree && cached->start == start) {
-			atomic_dec(&cached->refs);
+			if (clear)
+				atomic_dec(&cached->refs);
 			state = cached;
 			goto hit_next;
 		}
-		free_extent_state(cached);
+		if (clear)
+			free_extent_state(cached);
 	}
 	/*
 	 * this search will find the extents that end after
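
Note: the subtlety in this hunk is that the caller's cached extent_state is surrendered (pointer NULLed, reference dropped) only when the bits being cleared include EXTENT_IOBITS or EXTENT_BOUNDARY, since clearing those can free the state outright; for any other bits the cached pointer stays valid for the caller's next call. In outline, condensed from the hunk above:

	int clear = 0;

	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		clear = 1;
	...
	if (clear) {
		*cached_state = NULL;	/* caller must not reuse it */
		cached_state = NULL;
	}
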
@@ -946,11 +954,11 @@ int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 }
 
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
-			gfp_t mask)
+			struct extent_state **cached_state, gfp_t mask)
 {
 	return set_extent_bit(tree, start, end,
 			      EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
-			      0, NULL, NULL, mask);
+			      0, NULL, cached_state, mask);
 }
 
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
@@ -984,10 +992,11 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 }
 
 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
-				 u64 end, gfp_t mask)
+				 u64 end, struct extent_state **cached_state,
+				 gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
-				NULL, mask);
+				cached_state, mask);
 }
 
 int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
@@ -1171,7 +1180,8 @@ out:
  * 1 is returned if we find something, 0 if nothing was in the tree
  */
 static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
-					u64 *start, u64 *end, u64 max_bytes)
+					u64 *start, u64 *end, u64 max_bytes,
+					struct extent_state **cached_state)
 {
 	struct rb_node *node;
 	struct extent_state *state;
@@ -1203,8 +1213,11 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
 			*end = state->end;
 			goto out;
 		}
-		if (!found)
+		if (!found) {
 			*start = state->start;
+			*cached_state = state;
+			atomic_inc(&state->refs);
+		}
 		found++;
 		*end = state->end;
 		cur_start = state->end + 1;
@@ -1336,10 +1349,11 @@ again:
 	delalloc_start = *start;
 	delalloc_end = 0;
 	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
-				    max_bytes);
+				    max_bytes, &cached_state);
 	if (!found || delalloc_end <= *start) {
 		*start = delalloc_start;
 		*end = delalloc_end;
+		free_extent_state(cached_state);
 		return found;
 	}
 
@@ -1722,7 +1736,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
 	}
 
 	if (!uptodate) {
-		clear_extent_uptodate(tree, start, end, GFP_NOFS);
+		clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
 		ClearPageUptodate(page);
 		SetPageError(page);
 	}
@@ -1750,7 +1764,8 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
 static void end_bio_extent_readpage(struct bio *bio, int err)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct bio_vec *bvec = bio->bi_io_vec;
 	struct extent_io_tree *tree;
 	u64 start;
 	u64 end;
@@ -1773,7 +1788,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		else
 			whole_page = 0;
 
-		if (--bvec >= bio->bi_io_vec)
+		if (++bvec <= bvec_end)
 			prefetchw(&bvec->bv_page->flags);
 
 		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
@@ -1818,7 +1833,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		}
 		check_page_locked(tree, page);
 		}
-	} while (bvec >= bio->bi_io_vec);
+	} while (bvec <= bvec_end);
 
 	bio_put(bio);
 }
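
Note: end_bio_extent_readpage() used to walk the completed io_vec backwards from the last element; these hunks flip it to a forward walk between bvec and bvec_end, so pages are handled in submission order. A runnable model of the new cursor movement:

	#include <stdio.h>

	struct bio_vec_toy {
		int page_id;
	};

	int main(void)
	{
		struct bio_vec_toy io_vec[3] = { {0}, {1}, {2} };
		struct bio_vec_toy *bvec = io_vec;	/* was: io_vec + 2 */
		struct bio_vec_toy *bvec_end = io_vec + 2;

		do {
			printf("complete page %d\n", bvec->page_id);
		} while (++bvec <= bvec_end);	/* was: --bvec >= io_vec */
		return 0;
	}
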
@@ -2663,33 +2678,20 @@ int extent_readpages(struct extent_io_tree *tree,
 {
 	struct bio *bio = NULL;
 	unsigned page_idx;
-	struct pagevec pvec;
 	unsigned long bio_flags = 0;
 
-	pagevec_init(&pvec, 0);
 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
 		struct page *page = list_entry(pages->prev, struct page, lru);
 
 		prefetchw(&page->flags);
 		list_del(&page->lru);
-		/*
-		 * what we want to do here is call add_to_page_cache_lru,
-		 * but that isn't exported, so we reproduce it here
-		 */
-		if (!add_to_page_cache(page, mapping,
+		if (!add_to_page_cache_lru(page, mapping,
 					page->index, GFP_KERNEL)) {
-
-			/* open coding of lru_cache_add, also not exported */
-			page_cache_get(page);
-			if (!pagevec_add(&pvec, page))
-				__pagevec_lru_add_file(&pvec);
 			__extent_read_full_page(tree, page, get_extent,
 						&bio, 0, &bio_flags);
 		}
 		page_cache_release(page);
 	}
-	if (pagevec_count(&pvec))
-		__pagevec_lru_add_file(&pvec);
 	BUG_ON(!list_empty(pages));
 	if (bio)
 		submit_one_bio(READ, bio, 0, bio_flags);
@@ -2704,6 +2706,7 @@ int extent_readpages(struct extent_io_tree *tree,
 int extent_invalidatepage(struct extent_io_tree *tree,
 			  struct page *page, unsigned long offset)
 {
+	struct extent_state *cached_state = NULL;
 	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
@@ -2712,12 +2715,12 @@ int extent_invalidatepage(struct extent_io_tree *tree,
 	if (start > end)
 		return 0;
 
-	lock_extent(tree, start, end, GFP_NOFS);
+	lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
 	wait_on_page_writeback(page);
 	clear_extent_bit(tree, start, end,
 			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
 			 EXTENT_DO_ACCOUNTING,
-			 1, 1, NULL, GFP_NOFS);
+			 1, 1, &cached_state, GFP_NOFS);
 	return 0;
 }
 
@@ -2920,16 +2923,17 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
 			  get_extent_t *get_extent)
 {
 	struct inode *inode = mapping->host;
+	struct extent_state *cached_state = NULL;
 	u64 start = iblock << inode->i_blkbits;
 	sector_t sector = 0;
 	size_t blksize = (1 << inode->i_blkbits);
 	struct extent_map *em;
 
-	lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
-		    GFP_NOFS);
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
+			 0, &cached_state, GFP_NOFS);
 	em = get_extent(inode, NULL, 0, start, blksize, 0);
-	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
-		      GFP_NOFS);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start,
+			     start + blksize - 1, &cached_state, GFP_NOFS);
 	if (!em || IS_ERR(em))
 		return 0;
 
@@ -2951,6 +2955,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	u32 flags = 0;
 	u64 disko = 0;
 	struct extent_map *em = NULL;
+	struct extent_state *cached_state = NULL;
 	int end = 0;
 	u64 em_start = 0, em_len = 0;
 	unsigned long emflags;
@@ -2959,8 +2964,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	if (len == 0)
 		return -EINVAL;
 
-	lock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
-		    GFP_NOFS);
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
+			 &cached_state, GFP_NOFS);
 	em = get_extent(inode, NULL, 0, off, max - off, 0);
 	if (!em)
 		goto out;
@@ -3023,8 +3028,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 out_free:
 	free_extent_map(em);
 out:
-	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
-		      GFP_NOFS);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
+			     &cached_state, GFP_NOFS);
 	return ret;
 }
 
@@ -3264,7 +3269,8 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
 }
 
 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
-				 struct extent_buffer *eb)
+				 struct extent_buffer *eb,
+				 struct extent_state **cached_state)
 {
 	unsigned long i;
 	struct page *page;
@@ -3274,7 +3280,7 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 
 	clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
-			      GFP_NOFS);
+			      cached_state, GFP_NOFS);
 	for (i = 0; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
 		if (page)
@@ -3334,7 +3340,8 @@ int extent_range_uptodate(struct extent_io_tree *tree,
 }
 
 int extent_buffer_uptodate(struct extent_io_tree *tree,
-			   struct extent_buffer *eb)
+			   struct extent_buffer *eb,
+			   struct extent_state *cached_state)
 {
 	int ret = 0;
 	unsigned long num_pages;
@@ -3346,7 +3353,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
 		return 1;
 
 	ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-			     EXTENT_UPTODATE, 1, NULL);
+			     EXTENT_UPTODATE, 1, cached_state);
 	if (ret)
 		return ret;
 
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 36de250a7b2b..bbab4813646f 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -163,6 +163,8 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		     int bits, struct extent_state **cached, gfp_t mask);
 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
+			 struct extent_state **cached, gfp_t mask);
 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
 		    gfp_t mask);
 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
@@ -196,7 +198,7 @@ int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
 int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
 				  u64 end, gfp_t mask);
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
-			gfp_t mask);
+			struct extent_state **cached_state, gfp_t mask);
 int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
 		       gfp_t mask);
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -281,9 +283,11 @@ int test_extent_buffer_dirty(struct extent_io_tree *tree,
 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
 			       struct extent_buffer *eb);
 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
-				 struct extent_buffer *eb);
+				 struct extent_buffer *eb,
+				 struct extent_state **cached_state);
 int extent_buffer_uptodate(struct extent_io_tree *tree,
-			   struct extent_buffer *eb);
+			   struct extent_buffer *eb,
+			   struct extent_state *cached_state);
 int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
 		      unsigned long min_len, char **token, char **map,
 		      unsigned long *map_start,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 428fcac45f90..454ca52d6451 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1,5 +1,4 @@
 #include <linux/err.h>
-#include <linux/gfp.h>
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
@@ -35,7 +34,7 @@ void extent_map_exit(void)
  */
 void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
 {
-	tree->map.rb_node = NULL;
+	tree->map = RB_ROOT;
 	rwlock_init(&tree->lock);
 }
 
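Note: "tree->map = RB_ROOT;" is the same substitution made in disk-io.c and free-space-cache.c within this merge. RB_ROOT is the kernel's named initializer for an empty struct rb_root, so the empty-tree representation is spelled in one place instead of being open-coded as ".rb_node = NULL". A minimal mimic of the definition and its use:

	struct rb_node;

	struct rb_root {
		struct rb_node *rb_node;
	};

	#define RB_ROOT	(struct rb_root) { NULL, }

	static void tree_init(struct rb_root *root)
	{
		*root = RB_ROOT;	/* reads as "empty tree", not "NULL field" */
	}
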
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 9b99886562d0..54a255065aa3 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/bio.h>
+#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
 #include "ctree.h"
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 6ed434ac037f..29ff749ff4ca 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -28,6 +28,7 @@
 #include <linux/writeback.h>
 #include <linux/statfs.h>
 #include <linux/compat.h>
+#include <linux/slab.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -123,7 +124,8 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		    root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
 
 	end_of_last_block = start_pos + num_bytes - 1;
-	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
+	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
+					NULL);
 	if (err)
 		return err;
 
@@ -753,6 +755,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
 			 loff_t pos, unsigned long first_index,
 			 unsigned long last_index, size_t write_bytes)
 {
+	struct extent_state *cached_state = NULL;
 	int i;
 	unsigned long index = pos >> PAGE_CACHE_SHIFT;
 	struct inode *inode = fdentry(file)->d_inode;
@@ -781,16 +784,18 @@ again:
 	}
 	if (start_pos < inode->i_size) {
 		struct btrfs_ordered_extent *ordered;
-		lock_extent(&BTRFS_I(inode)->io_tree,
-			    start_pos, last_pos - 1, GFP_NOFS);
+		lock_extent_bits(&BTRFS_I(inode)->io_tree,
+				 start_pos, last_pos - 1, 0, &cached_state,
+				 GFP_NOFS);
 		ordered = btrfs_lookup_first_ordered_extent(inode,
 							    last_pos - 1);
 		if (ordered &&
 		    ordered->file_offset + ordered->len > start_pos &&
 		    ordered->file_offset < last_pos) {
 			btrfs_put_ordered_extent(ordered);
-			unlock_extent(&BTRFS_I(inode)->io_tree,
-				      start_pos, last_pos - 1, GFP_NOFS);
+			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+					     start_pos, last_pos - 1,
+					     &cached_state, GFP_NOFS);
 			for (i = 0; i < num_pages; i++) {
 				unlock_page(pages[i]);
 				page_cache_release(pages[i]);
@@ -802,12 +807,13 @@ again:
802 if (ordered) 807 if (ordered)
803 btrfs_put_ordered_extent(ordered); 808 btrfs_put_ordered_extent(ordered);
804 809
805 clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, 810 clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
806 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | 811 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
807 EXTENT_DO_ACCOUNTING, 812 EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
808 GFP_NOFS); 813 GFP_NOFS);
809 unlock_extent(&BTRFS_I(inode)->io_tree, 814 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
810 start_pos, last_pos - 1, GFP_NOFS); 815 start_pos, last_pos - 1, &cached_state,
816 GFP_NOFS);
811 } 817 }
812 for (i = 0; i < num_pages; i++) { 818 for (i = 0; i < num_pages; i++) {
813 clear_page_dirty_for_io(pages[i]); 819 clear_page_dirty_for_io(pages[i]);
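All of the file.c hunks above convert to the same pattern: lock_extent_bits() hands back the extent_state record it locked through a cached_state out-pointer, and the paired unlock_extent_cached()/clear_extent_bit() consume that record, saving a second search of the io tree on unlock. A minimal sketch of the pattern, using the same btrfs extent-io calls as the hunks above:

	struct extent_state *cached_state = NULL;

	/* lock [start, end] and remember which state record we locked */
	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, end, 0,
			 &cached_state, GFP_NOFS);

	/* ... work on the locked range ... */

	/* unlock via the cached record instead of a fresh tree lookup */
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, end,
			     &cached_state, GFP_NOFS);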
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index cb2849f03251..f488fac04d99 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h>
21#include <linux/math64.h> 22#include <linux/math64.h>
22#include "ctree.h" 23#include "ctree.h"
23#include "free-space-cache.h" 24#include "free-space-cache.h"
@@ -870,7 +871,7 @@ __btrfs_return_cluster_to_free_space(
870 tree_insert_offset(&block_group->free_space_offset, 871 tree_insert_offset(&block_group->free_space_offset,
871 entry->offset, &entry->offset_index, 0); 872 entry->offset, &entry->offset_index, 0);
872 } 873 }
873 cluster->root.rb_node = NULL; 874 cluster->root = RB_ROOT;
874 875
875out: 876out:
876 spin_unlock(&cluster->lock); 877 spin_unlock(&cluster->lock);
@@ -1355,7 +1356,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
1355{ 1356{
1356 spin_lock_init(&cluster->lock); 1357 spin_lock_init(&cluster->lock);
1357 spin_lock_init(&cluster->refill_lock); 1358 spin_lock_init(&cluster->refill_lock);
1358 cluster->root.rb_node = NULL; 1359 cluster->root = RB_ROOT;
1359 cluster->max_size = 0; 1360 cluster->max_size = 0;
1360 cluster->points_to_bitmap = false; 1361 cluster->points_to_bitmap = false;
1361 INIT_LIST_HEAD(&cluster->block_group_list); 1362 INIT_LIST_HEAD(&cluster->block_group_list);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c41db6d45ab6..2bfdc641d4e3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -36,6 +36,7 @@
36#include <linux/xattr.h> 36#include <linux/xattr.h>
37#include <linux/posix_acl.h> 37#include <linux/posix_acl.h>
38#include <linux/falloc.h> 38#include <linux/falloc.h>
39#include <linux/slab.h>
39#include "compat.h" 40#include "compat.h"
40#include "ctree.h" 41#include "ctree.h"
41#include "disk-io.h" 42#include "disk-io.h"
@@ -379,7 +380,8 @@ again:
379 * change at any time if we discover bad compression ratios. 380 * change at any time if we discover bad compression ratios.
380 */ 381 */
381 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && 382 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
382 btrfs_test_opt(root, COMPRESS)) { 383 (btrfs_test_opt(root, COMPRESS) ||
384 (BTRFS_I(inode)->force_compress))) {
383 WARN_ON(pages); 385 WARN_ON(pages);
384 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 386 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
385 387
@@ -483,8 +485,10 @@ again:
483 nr_pages_ret = 0; 485 nr_pages_ret = 0;
484 486
485 /* flag the file so we don't compress in the future */ 487 /* flag the file so we don't compress in the future */
486 if (!btrfs_test_opt(root, FORCE_COMPRESS)) 488 if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
489 !(BTRFS_I(inode)->force_compress)) {
487 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; 490 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
491 }
488 } 492 }
489 if (will_compress) { 493 if (will_compress) {
490 *num_added += 1; 494 *num_added += 1;
@@ -570,8 +574,8 @@ retry:
570 unsigned long nr_written = 0; 574 unsigned long nr_written = 0;
571 575
572 lock_extent(io_tree, async_extent->start, 576 lock_extent(io_tree, async_extent->start,
573 async_extent->start + 577 async_extent->start +
574 async_extent->ram_size - 1, GFP_NOFS); 578 async_extent->ram_size - 1, GFP_NOFS);
575 579
576 /* allocate blocks */ 580 /* allocate blocks */
577 ret = cow_file_range(inode, async_cow->locked_page, 581 ret = cow_file_range(inode, async_cow->locked_page,
@@ -793,7 +797,7 @@ static noinline int cow_file_range(struct inode *inode,
793 while (disk_num_bytes > 0) { 797 while (disk_num_bytes > 0) {
794 unsigned long op; 798 unsigned long op;
795 799
796 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); 800 cur_alloc_size = disk_num_bytes;
797 ret = btrfs_reserve_extent(trans, root, cur_alloc_size, 801 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
798 root->sectorsize, 0, alloc_hint, 802 root->sectorsize, 0, alloc_hint,
799 (u64)-1, &ins, 1); 803 (u64)-1, &ins, 1);
@@ -1211,7 +1215,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1211 else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) 1215 else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
1212 ret = run_delalloc_nocow(inode, locked_page, start, end, 1216 ret = run_delalloc_nocow(inode, locked_page, start, end,
1213 page_started, 0, nr_written); 1217 page_started, 0, nr_written);
1214 else if (!btrfs_test_opt(root, COMPRESS)) 1218 else if (!btrfs_test_opt(root, COMPRESS) &&
1219 !(BTRFS_I(inode)->force_compress))
1215 ret = cow_file_range(inode, locked_page, start, end, 1220 ret = cow_file_range(inode, locked_page, start, end,
1216 page_started, nr_written, 1); 1221 page_started, nr_written, 1);
1217 else 1222 else
@@ -1223,30 +1228,9 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1223static int btrfs_split_extent_hook(struct inode *inode, 1228static int btrfs_split_extent_hook(struct inode *inode,
1224 struct extent_state *orig, u64 split) 1229 struct extent_state *orig, u64 split)
1225{ 1230{
1226 struct btrfs_root *root = BTRFS_I(inode)->root;
1227 u64 size;
1228
1229 if (!(orig->state & EXTENT_DELALLOC)) 1231 if (!(orig->state & EXTENT_DELALLOC))
1230 return 0; 1232 return 0;
1231 1233
1232 size = orig->end - orig->start + 1;
1233 if (size > root->fs_info->max_extent) {
1234 u64 num_extents;
1235 u64 new_size;
1236
1237 new_size = orig->end - split + 1;
1238 num_extents = div64_u64(size + root->fs_info->max_extent - 1,
1239 root->fs_info->max_extent);
1240
1241 /*
1242 * if we break a large extent up then leave outstanding_extents
1243 * be, since we've already accounted for the large extent.
1244 */
1245 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1246 root->fs_info->max_extent) < num_extents)
1247 return 0;
1248 }
1249
1250 spin_lock(&BTRFS_I(inode)->accounting_lock); 1234 spin_lock(&BTRFS_I(inode)->accounting_lock);
1251 BTRFS_I(inode)->outstanding_extents++; 1235 BTRFS_I(inode)->outstanding_extents++;
1252 spin_unlock(&BTRFS_I(inode)->accounting_lock); 1236 spin_unlock(&BTRFS_I(inode)->accounting_lock);
@@ -1264,38 +1248,10 @@ static int btrfs_merge_extent_hook(struct inode *inode,
1264 struct extent_state *new, 1248 struct extent_state *new,
1265 struct extent_state *other) 1249 struct extent_state *other)
1266{ 1250{
1267 struct btrfs_root *root = BTRFS_I(inode)->root;
1268 u64 new_size, old_size;
1269 u64 num_extents;
1270
1271 /* not delalloc, ignore it */ 1251 /* not delalloc, ignore it */
1272 if (!(other->state & EXTENT_DELALLOC)) 1252 if (!(other->state & EXTENT_DELALLOC))
1273 return 0; 1253 return 0;
1274 1254
1275 old_size = other->end - other->start + 1;
1276 if (new->start < other->start)
1277 new_size = other->end - new->start + 1;
1278 else
1279 new_size = new->end - other->start + 1;
1280
1281 /* we're not bigger than the max, unreserve the space and go */
1282 if (new_size <= root->fs_info->max_extent) {
1283 spin_lock(&BTRFS_I(inode)->accounting_lock);
1284 BTRFS_I(inode)->outstanding_extents--;
1285 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1286 return 0;
1287 }
1288
1289 /*
1290 * If we grew by another max_extent, just return, we want to keep that
1291 * reserved amount.
1292 */
1293 num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
1294 root->fs_info->max_extent);
1295 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1296 root->fs_info->max_extent) > num_extents)
1297 return 0;
1298
1299 spin_lock(&BTRFS_I(inode)->accounting_lock); 1255 spin_lock(&BTRFS_I(inode)->accounting_lock);
1300 BTRFS_I(inode)->outstanding_extents--; 1256 BTRFS_I(inode)->outstanding_extents--;
1301 spin_unlock(&BTRFS_I(inode)->accounting_lock); 1257 spin_unlock(&BTRFS_I(inode)->accounting_lock);
@@ -1324,6 +1280,7 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1324 BTRFS_I(inode)->outstanding_extents++; 1280 BTRFS_I(inode)->outstanding_extents++;
1325 spin_unlock(&BTRFS_I(inode)->accounting_lock); 1281 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1326 btrfs_delalloc_reserve_space(root, inode, end - start + 1); 1282 btrfs_delalloc_reserve_space(root, inode, end - start + 1);
1283
1327 spin_lock(&root->fs_info->delalloc_lock); 1284 spin_lock(&root->fs_info->delalloc_lock);
1328 BTRFS_I(inode)->delalloc_bytes += end - start + 1; 1285 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
1329 root->fs_info->delalloc_bytes += end - start + 1; 1286 root->fs_info->delalloc_bytes += end - start + 1;
@@ -1352,6 +1309,7 @@ static int btrfs_clear_bit_hook(struct inode *inode,
1352 1309
1353 if (bits & EXTENT_DO_ACCOUNTING) { 1310 if (bits & EXTENT_DO_ACCOUNTING) {
1354 spin_lock(&BTRFS_I(inode)->accounting_lock); 1311 spin_lock(&BTRFS_I(inode)->accounting_lock);
1312 WARN_ON(!BTRFS_I(inode)->outstanding_extents);
1355 BTRFS_I(inode)->outstanding_extents--; 1313 BTRFS_I(inode)->outstanding_extents--;
1356 spin_unlock(&BTRFS_I(inode)->accounting_lock); 1314 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1357 btrfs_unreserve_metadata_for_delalloc(root, inode, 1); 1315 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
@@ -1508,12 +1466,13 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1508 return 0; 1466 return 0;
1509} 1467}
1510 1468
1511int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end) 1469int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
1470 struct extent_state **cached_state)
1512{ 1471{
1513 if ((end & (PAGE_CACHE_SIZE - 1)) == 0) 1472 if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
1514 WARN_ON(1); 1473 WARN_ON(1);
1515 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, 1474 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1516 GFP_NOFS); 1475 cached_state, GFP_NOFS);
1517} 1476}
1518 1477
1519/* see btrfs_writepage_start_hook for details on why this is required */ 1478/* see btrfs_writepage_start_hook for details on why this is required */
@@ -1526,6 +1485,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
1526{ 1485{
1527 struct btrfs_writepage_fixup *fixup; 1486 struct btrfs_writepage_fixup *fixup;
1528 struct btrfs_ordered_extent *ordered; 1487 struct btrfs_ordered_extent *ordered;
1488 struct extent_state *cached_state = NULL;
1529 struct page *page; 1489 struct page *page;
1530 struct inode *inode; 1490 struct inode *inode;
1531 u64 page_start; 1491 u64 page_start;
@@ -1544,7 +1504,8 @@ again:
1544 page_start = page_offset(page); 1504 page_start = page_offset(page);
1545 page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; 1505 page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
1546 1506
1547 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); 1507 lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
1508 &cached_state, GFP_NOFS);
1548 1509
1549 /* already ordered? We're done */ 1510 /* already ordered? We're done */
1550 if (PagePrivate2(page)) 1511 if (PagePrivate2(page))
@@ -1552,17 +1513,18 @@ again:
1552 1513
1553 ordered = btrfs_lookup_ordered_extent(inode, page_start); 1514 ordered = btrfs_lookup_ordered_extent(inode, page_start);
1554 if (ordered) { 1515 if (ordered) {
1555 unlock_extent(&BTRFS_I(inode)->io_tree, page_start, 1516 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
1556 page_end, GFP_NOFS); 1517 page_end, &cached_state, GFP_NOFS);
1557 unlock_page(page); 1518 unlock_page(page);
1558 btrfs_start_ordered_extent(inode, ordered, 1); 1519 btrfs_start_ordered_extent(inode, ordered, 1);
1559 goto again; 1520 goto again;
1560 } 1521 }
1561 1522
1562 btrfs_set_extent_delalloc(inode, page_start, page_end); 1523 btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
1563 ClearPageChecked(page); 1524 ClearPageChecked(page);
1564out: 1525out:
1565 unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); 1526 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
1527 &cached_state, GFP_NOFS);
1566out_page: 1528out_page:
1567 unlock_page(page); 1529 unlock_page(page);
1568 page_cache_release(page); 1530 page_cache_release(page);
@@ -1691,14 +1653,14 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1691 struct btrfs_trans_handle *trans; 1653 struct btrfs_trans_handle *trans;
1692 struct btrfs_ordered_extent *ordered_extent = NULL; 1654 struct btrfs_ordered_extent *ordered_extent = NULL;
1693 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1655 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1656 struct extent_state *cached_state = NULL;
1694 int compressed = 0; 1657 int compressed = 0;
1695 int ret; 1658 int ret;
1696 1659
1697 ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1); 1660 ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
1661 end - start + 1);
1698 if (!ret) 1662 if (!ret)
1699 return 0; 1663 return 0;
1700
1701 ordered_extent = btrfs_lookup_ordered_extent(inode, start);
1702 BUG_ON(!ordered_extent); 1664 BUG_ON(!ordered_extent);
1703 1665
1704 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1666 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
@@ -1713,9 +1675,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1713 goto out; 1675 goto out;
1714 } 1676 }
1715 1677
1716 lock_extent(io_tree, ordered_extent->file_offset, 1678 lock_extent_bits(io_tree, ordered_extent->file_offset,
1717 ordered_extent->file_offset + ordered_extent->len - 1, 1679 ordered_extent->file_offset + ordered_extent->len - 1,
1718 GFP_NOFS); 1680 0, &cached_state, GFP_NOFS);
1719 1681
1720 trans = btrfs_join_transaction(root, 1); 1682 trans = btrfs_join_transaction(root, 1);
1721 1683
@@ -1742,9 +1704,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1742 ordered_extent->len); 1704 ordered_extent->len);
1743 BUG_ON(ret); 1705 BUG_ON(ret);
1744 } 1706 }
1745 unlock_extent(io_tree, ordered_extent->file_offset, 1707 unlock_extent_cached(io_tree, ordered_extent->file_offset,
1746 ordered_extent->file_offset + ordered_extent->len - 1, 1708 ordered_extent->file_offset +
1747 GFP_NOFS); 1709 ordered_extent->len - 1, &cached_state, GFP_NOFS);
1710
1748 add_pending_csums(trans, inode, ordered_extent->file_offset, 1711 add_pending_csums(trans, inode, ordered_extent->file_offset,
1749 &ordered_extent->list); 1712 &ordered_extent->list);
1750 1713
@@ -2153,7 +2116,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2153 found_key.objectid = found_key.offset; 2116 found_key.objectid = found_key.offset;
2154 found_key.type = BTRFS_INODE_ITEM_KEY; 2117 found_key.type = BTRFS_INODE_ITEM_KEY;
2155 found_key.offset = 0; 2118 found_key.offset = 0;
2156 inode = btrfs_iget(root->fs_info->sb, &found_key, root); 2119 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2157 if (IS_ERR(inode)) 2120 if (IS_ERR(inode))
2158 break; 2121 break;
2159 2122
@@ -3081,6 +3044,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3081 struct btrfs_root *root = BTRFS_I(inode)->root; 3044 struct btrfs_root *root = BTRFS_I(inode)->root;
3082 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3045 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3083 struct btrfs_ordered_extent *ordered; 3046 struct btrfs_ordered_extent *ordered;
3047 struct extent_state *cached_state = NULL;
3084 char *kaddr; 3048 char *kaddr;
3085 u32 blocksize = root->sectorsize; 3049 u32 blocksize = root->sectorsize;
3086 pgoff_t index = from >> PAGE_CACHE_SHIFT; 3050 pgoff_t index = from >> PAGE_CACHE_SHIFT;
@@ -3127,12 +3091,14 @@ again:
3127 } 3091 }
3128 wait_on_page_writeback(page); 3092 wait_on_page_writeback(page);
3129 3093
3130 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 3094 lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
3095 GFP_NOFS);
3131 set_page_extent_mapped(page); 3096 set_page_extent_mapped(page);
3132 3097
3133 ordered = btrfs_lookup_ordered_extent(inode, page_start); 3098 ordered = btrfs_lookup_ordered_extent(inode, page_start);
3134 if (ordered) { 3099 if (ordered) {
3135 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 3100 unlock_extent_cached(io_tree, page_start, page_end,
3101 &cached_state, GFP_NOFS);
3136 unlock_page(page); 3102 unlock_page(page);
3137 page_cache_release(page); 3103 page_cache_release(page);
3138 btrfs_start_ordered_extent(inode, ordered, 1); 3104 btrfs_start_ordered_extent(inode, ordered, 1);
@@ -3140,13 +3106,15 @@ again:
3140 goto again; 3106 goto again;
3141 } 3107 }
3142 3108
3143 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 3109 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
3144 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 3110 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
3145 GFP_NOFS); 3111 0, 0, &cached_state, GFP_NOFS);
3146 3112
3147 ret = btrfs_set_extent_delalloc(inode, page_start, page_end); 3113 ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
3114 &cached_state);
3148 if (ret) { 3115 if (ret) {
3149 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 3116 unlock_extent_cached(io_tree, page_start, page_end,
3117 &cached_state, GFP_NOFS);
3150 goto out_unlock; 3118 goto out_unlock;
3151 } 3119 }
3152 3120
@@ -3159,7 +3127,8 @@ again:
3159 } 3127 }
3160 ClearPageChecked(page); 3128 ClearPageChecked(page);
3161 set_page_dirty(page); 3129 set_page_dirty(page);
3162 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 3130 unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
3131 GFP_NOFS);
3163 3132
3164out_unlock: 3133out_unlock:
3165 if (ret) 3134 if (ret)
@@ -3177,6 +3146,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3177 struct btrfs_root *root = BTRFS_I(inode)->root; 3146 struct btrfs_root *root = BTRFS_I(inode)->root;
3178 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3147 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3179 struct extent_map *em; 3148 struct extent_map *em;
3149 struct extent_state *cached_state = NULL;
3180 u64 mask = root->sectorsize - 1; 3150 u64 mask = root->sectorsize - 1;
3181 u64 hole_start = (inode->i_size + mask) & ~mask; 3151 u64 hole_start = (inode->i_size + mask) & ~mask;
3182 u64 block_end = (size + mask) & ~mask; 3152 u64 block_end = (size + mask) & ~mask;
@@ -3192,11 +3162,13 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3192 struct btrfs_ordered_extent *ordered; 3162 struct btrfs_ordered_extent *ordered;
3193 btrfs_wait_ordered_range(inode, hole_start, 3163 btrfs_wait_ordered_range(inode, hole_start,
3194 block_end - hole_start); 3164 block_end - hole_start);
3195 lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); 3165 lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
3166 &cached_state, GFP_NOFS);
3196 ordered = btrfs_lookup_ordered_extent(inode, hole_start); 3167 ordered = btrfs_lookup_ordered_extent(inode, hole_start);
3197 if (!ordered) 3168 if (!ordered)
3198 break; 3169 break;
3199 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); 3170 unlock_extent_cached(io_tree, hole_start, block_end - 1,
3171 &cached_state, GFP_NOFS);
3200 btrfs_put_ordered_extent(ordered); 3172 btrfs_put_ordered_extent(ordered);
3201 } 3173 }
3202 3174
@@ -3241,7 +3213,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3241 break; 3213 break;
3242 } 3214 }
3243 3215
3244 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); 3216 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
3217 GFP_NOFS);
3245 return err; 3218 return err;
3246} 3219}
3247 3220
@@ -3639,6 +3612,7 @@ static noinline void init_btrfs_i(struct inode *inode)
3639 bi->index_cnt = (u64)-1; 3612 bi->index_cnt = (u64)-1;
3640 bi->last_unlink_trans = 0; 3613 bi->last_unlink_trans = 0;
3641 bi->ordered_data_close = 0; 3614 bi->ordered_data_close = 0;
3615 bi->force_compress = 0;
3642 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); 3616 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
3643 extent_io_tree_init(&BTRFS_I(inode)->io_tree, 3617 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
3644 inode->i_mapping, GFP_NOFS); 3618 inode->i_mapping, GFP_NOFS);
@@ -3687,7 +3661,7 @@ static struct inode *btrfs_iget_locked(struct super_block *s,
3687 * Returns in *is_new if the inode was read from disk 3661 * Returns in *is_new if the inode was read from disk
3688 */ 3662 */
3689struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 3663struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3690 struct btrfs_root *root) 3664 struct btrfs_root *root, int *new)
3691{ 3665{
3692 struct inode *inode; 3666 struct inode *inode;
3693 3667
@@ -3702,6 +3676,8 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3702 3676
3703 inode_tree_add(inode); 3677 inode_tree_add(inode);
3704 unlock_new_inode(inode); 3678 unlock_new_inode(inode);
3679 if (new)
3680 *new = 1;
3705 } 3681 }
3706 3682
3707 return inode; 3683 return inode;
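The new fourth argument is an optional out-parameter: callers that do not care whether the inode came from the inode cache pass NULL (as every converted call site below does), while a caller that needs to know can pass an int. A sketch, with sb, location and root standing in for a real caller's values:

	int new = 0;
	struct inode *inode;

	inode = btrfs_iget(sb, &location, root, &new);
	if (!IS_ERR(inode) && new) {
		/* inode was freshly read from disk, not a cache hit */
	}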
@@ -3754,7 +3730,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3754 return NULL; 3730 return NULL;
3755 3731
3756 if (location.type == BTRFS_INODE_ITEM_KEY) { 3732 if (location.type == BTRFS_INODE_ITEM_KEY) {
3757 inode = btrfs_iget(dir->i_sb, &location, root); 3733 inode = btrfs_iget(dir->i_sb, &location, root, NULL);
3758 return inode; 3734 return inode;
3759 } 3735 }
3760 3736
@@ -3769,7 +3745,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3769 else 3745 else
3770 inode = new_simple_dir(dir->i_sb, &location, sub_root); 3746 inode = new_simple_dir(dir->i_sb, &location, sub_root);
3771 } else { 3747 } else {
3772 inode = btrfs_iget(dir->i_sb, &location, sub_root); 3748 inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
3773 } 3749 }
3774 srcu_read_unlock(&root->fs_info->subvol_srcu, index); 3750 srcu_read_unlock(&root->fs_info->subvol_srcu, index);
3775 3751
@@ -4501,7 +4477,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4501 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4477 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4502 if (err) { 4478 if (err) {
4503 err = -ENOSPC; 4479 err = -ENOSPC;
4504 goto out_unlock; 4480 goto out_fail;
4505 } 4481 }
4506 4482
4507 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4483 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
@@ -4979,6 +4955,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4979{ 4955{
4980 struct extent_io_tree *tree; 4956 struct extent_io_tree *tree;
4981 struct btrfs_ordered_extent *ordered; 4957 struct btrfs_ordered_extent *ordered;
4958 struct extent_state *cached_state = NULL;
4982 u64 page_start = page_offset(page); 4959 u64 page_start = page_offset(page);
4983 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 4960 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
4984 4961
@@ -4997,7 +4974,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4997 btrfs_releasepage(page, GFP_NOFS); 4974 btrfs_releasepage(page, GFP_NOFS);
4998 return; 4975 return;
4999 } 4976 }
5000 lock_extent(tree, page_start, page_end, GFP_NOFS); 4977 lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
4978 GFP_NOFS);
5001 ordered = btrfs_lookup_ordered_extent(page->mapping->host, 4979 ordered = btrfs_lookup_ordered_extent(page->mapping->host,
5002 page_offset(page)); 4980 page_offset(page));
5003 if (ordered) { 4981 if (ordered) {
@@ -5008,7 +4986,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
5008 clear_extent_bit(tree, page_start, page_end, 4986 clear_extent_bit(tree, page_start, page_end,
5009 EXTENT_DIRTY | EXTENT_DELALLOC | 4987 EXTENT_DIRTY | EXTENT_DELALLOC |
5010 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, 4988 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
5011 NULL, GFP_NOFS); 4989 &cached_state, GFP_NOFS);
5012 /* 4990 /*
5013 * whoever cleared the private bit is responsible 4991 * whoever cleared the private bit is responsible
5014 * for the finish_ordered_io 4992 * for the finish_ordered_io
@@ -5018,11 +4996,13 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
5018 page_start, page_end); 4996 page_start, page_end);
5019 } 4997 }
5020 btrfs_put_ordered_extent(ordered); 4998 btrfs_put_ordered_extent(ordered);
5021 lock_extent(tree, page_start, page_end, GFP_NOFS); 4999 cached_state = NULL;
5000 lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
5001 GFP_NOFS);
5022 } 5002 }
5023 clear_extent_bit(tree, page_start, page_end, 5003 clear_extent_bit(tree, page_start, page_end,
5024 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 5004 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
5025 EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS); 5005 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS);
5026 __btrfs_releasepage(page, GFP_NOFS); 5006 __btrfs_releasepage(page, GFP_NOFS);
5027 5007
5028 ClearPageChecked(page); 5008 ClearPageChecked(page);
@@ -5055,6 +5035,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5055 struct btrfs_root *root = BTRFS_I(inode)->root; 5035 struct btrfs_root *root = BTRFS_I(inode)->root;
5056 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 5036 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5057 struct btrfs_ordered_extent *ordered; 5037 struct btrfs_ordered_extent *ordered;
5038 struct extent_state *cached_state = NULL;
5058 char *kaddr; 5039 char *kaddr;
5059 unsigned long zero_start; 5040 unsigned long zero_start;
5060 loff_t size; 5041 loff_t size;
@@ -5093,7 +5074,8 @@ again:
5093 } 5074 }
5094 wait_on_page_writeback(page); 5075 wait_on_page_writeback(page);
5095 5076
5096 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 5077 lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
5078 GFP_NOFS);
5097 set_page_extent_mapped(page); 5079 set_page_extent_mapped(page);
5098 5080
5099 /* 5081 /*
@@ -5102,7 +5084,8 @@ again:
5102 */ 5084 */
5103 ordered = btrfs_lookup_ordered_extent(inode, page_start); 5085 ordered = btrfs_lookup_ordered_extent(inode, page_start);
5104 if (ordered) { 5086 if (ordered) {
5105 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 5087 unlock_extent_cached(io_tree, page_start, page_end,
5088 &cached_state, GFP_NOFS);
5106 unlock_page(page); 5089 unlock_page(page);
5107 btrfs_start_ordered_extent(inode, ordered, 1); 5090 btrfs_start_ordered_extent(inode, ordered, 1);
5108 btrfs_put_ordered_extent(ordered); 5091 btrfs_put_ordered_extent(ordered);
@@ -5116,13 +5099,15 @@ again:
5116 * is probably a better way to do this, but for now keep consistent with 5099 * is probably a better way to do this, but for now keep consistent with
5117 * prepare_pages in the normal write path. 5100 * prepare_pages in the normal write path.
5118 */ 5101 */
5119 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 5102 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
5120 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 5103 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
5121 GFP_NOFS); 5104 0, 0, &cached_state, GFP_NOFS);
5122 5105
5123 ret = btrfs_set_extent_delalloc(inode, page_start, page_end); 5106 ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
5107 &cached_state);
5124 if (ret) { 5108 if (ret) {
5125 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 5109 unlock_extent_cached(io_tree, page_start, page_end,
5110 &cached_state, GFP_NOFS);
5126 ret = VM_FAULT_SIGBUS; 5111 ret = VM_FAULT_SIGBUS;
5127 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 5112 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5128 goto out_unlock; 5113 goto out_unlock;
@@ -5148,7 +5133,7 @@ again:
5148 BTRFS_I(inode)->last_trans = root->fs_info->generation; 5133 BTRFS_I(inode)->last_trans = root->fs_info->generation;
5149 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 5134 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
5150 5135
5151 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 5136 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
5152 5137
5153out_unlock: 5138out_unlock:
5154 btrfs_unreserve_metadata_for_delalloc(root, inode, 1); 5139 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
@@ -5353,7 +5338,6 @@ free:
5353void btrfs_drop_inode(struct inode *inode) 5338void btrfs_drop_inode(struct inode *inode)
5354{ 5339{
5355 struct btrfs_root *root = BTRFS_I(inode)->root; 5340 struct btrfs_root *root = BTRFS_I(inode)->root;
5356
5357 if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0) 5341 if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
5358 generic_delete_inode(inode); 5342 generic_delete_inode(inode);
5359 else 5343 else
@@ -5757,18 +5741,15 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
5757 struct btrfs_trans_handle *trans; 5741 struct btrfs_trans_handle *trans;
5758 struct btrfs_root *root = BTRFS_I(inode)->root; 5742 struct btrfs_root *root = BTRFS_I(inode)->root;
5759 struct btrfs_key ins; 5743 struct btrfs_key ins;
5760 u64 alloc_size;
5761 u64 cur_offset = start; 5744 u64 cur_offset = start;
5762 u64 num_bytes = end - start; 5745 u64 num_bytes = end - start;
5763 int ret = 0; 5746 int ret = 0;
5764 u64 i_size; 5747 u64 i_size;
5765 5748
5766 while (num_bytes > 0) { 5749 while (num_bytes > 0) {
5767 alloc_size = min(num_bytes, root->fs_info->max_extent);
5768
5769 trans = btrfs_start_transaction(root, 1); 5750 trans = btrfs_start_transaction(root, 1);
5770 5751
5771 ret = btrfs_reserve_extent(trans, root, alloc_size, 5752 ret = btrfs_reserve_extent(trans, root, num_bytes,
5772 root->sectorsize, 0, alloc_hint, 5753 root->sectorsize, 0, alloc_hint,
5773 (u64)-1, &ins, 1); 5754 (u64)-1, &ins, 1);
5774 if (ret) { 5755 if (ret) {
@@ -5827,6 +5808,7 @@ stop_trans:
5827static long btrfs_fallocate(struct inode *inode, int mode, 5808static long btrfs_fallocate(struct inode *inode, int mode,
5828 loff_t offset, loff_t len) 5809 loff_t offset, loff_t len)
5829{ 5810{
5811 struct extent_state *cached_state = NULL;
5830 u64 cur_offset; 5812 u64 cur_offset;
5831 u64 last_byte; 5813 u64 last_byte;
5832 u64 alloc_start; 5814 u64 alloc_start;
@@ -5865,16 +5847,17 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5865 /* the extent lock is ordered inside the running 5847 /* the extent lock is ordered inside the running
5866 * transaction 5848 * transaction
5867 */ 5849 */
5868 lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 5850 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
5869 GFP_NOFS); 5851 locked_end, 0, &cached_state, GFP_NOFS);
5870 ordered = btrfs_lookup_first_ordered_extent(inode, 5852 ordered = btrfs_lookup_first_ordered_extent(inode,
5871 alloc_end - 1); 5853 alloc_end - 1);
5872 if (ordered && 5854 if (ordered &&
5873 ordered->file_offset + ordered->len > alloc_start && 5855 ordered->file_offset + ordered->len > alloc_start &&
5874 ordered->file_offset < alloc_end) { 5856 ordered->file_offset < alloc_end) {
5875 btrfs_put_ordered_extent(ordered); 5857 btrfs_put_ordered_extent(ordered);
5876 unlock_extent(&BTRFS_I(inode)->io_tree, 5858 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
5877 alloc_start, locked_end, GFP_NOFS); 5859 alloc_start, locked_end,
5860 &cached_state, GFP_NOFS);
5878 /* 5861 /*
5879 * we can't wait on the range with the transaction 5862 * we can't wait on the range with the transaction
5880 * running or with the extent lock held 5863 * running or with the extent lock held
@@ -5916,8 +5899,8 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5916 break; 5899 break;
5917 } 5900 }
5918 } 5901 }
5919 unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 5902 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
5920 GFP_NOFS); 5903 &cached_state, GFP_NOFS);
5921 5904
5922 btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode, 5905 btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode,
5923 alloc_end - alloc_start); 5906 alloc_end - alloc_start);
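Taken together, the inode.c hunks above reduce the compression policy to two rules: compress when the inode is not marked NOCOMPRESS and either the compress mount option or the new per-inode force_compress flag asks for it, and only fall back to flagging the inode NOCOMPRESS after a failed attempt when neither FORCE_COMPRESS nor force_compress is in effect. The first rule, summarized as a hypothetical helper (should_compress is illustrative, not part of the patch):

	static int should_compress(struct btrfs_root *root, struct inode *inode)
	{
		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
			return 0;
		return btrfs_test_opt(root, COMPRESS) ||
		       BTRFS_I(inode)->force_compress;
	}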
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 645a17927a8f..97a97839a867 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -39,6 +39,7 @@
39#include <linux/security.h> 39#include <linux/security.h>
40#include <linux/xattr.h> 40#include <linux/xattr.h>
41#include <linux/vmalloc.h> 41#include <linux/vmalloc.h>
42#include <linux/slab.h>
42#include "compat.h" 43#include "compat.h"
43#include "ctree.h" 44#include "ctree.h"
44#include "disk-io.h" 45#include "disk-io.h"
@@ -474,7 +475,79 @@ out_unlock:
474 return error; 475 return error;
475} 476}
476 477
477static int btrfs_defrag_file(struct file *file) 478static int should_defrag_range(struct inode *inode, u64 start, u64 len,
479 int thresh, u64 *last_len, u64 *skip,
480 u64 *defrag_end)
481{
482 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
483 struct extent_map *em = NULL;
484 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
485 int ret = 1;
486
487
488 if (thresh == 0)
489 thresh = 256 * 1024;
490
491 /*
492 * make sure that once we start defragging an extent, we keep on
493 * defragging it
494 */
495 if (start < *defrag_end)
496 return 1;
497
498 *skip = 0;
499
500 /*
501 * hopefully we have this extent in the tree already, try without
502 * the full extent lock
503 */
504 read_lock(&em_tree->lock);
505 em = lookup_extent_mapping(em_tree, start, len);
506 read_unlock(&em_tree->lock);
507
508 if (!em) {
509 /* get the big lock and read metadata off disk */
510 lock_extent(io_tree, start, start + len - 1, GFP_NOFS);
511 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
512 unlock_extent(io_tree, start, start + len - 1, GFP_NOFS);
513
514 if (IS_ERR(em))
515 return 0;
516 }
517
518 /* this will cover holes, and inline extents */
519 if (em->block_start >= EXTENT_MAP_LAST_BYTE)
520 ret = 0;
521
522 /*
523 * we hit a real extent; if it is big, don't bother defragging it again
524 */
525 if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh)
526 ret = 0;
527
528 /*
529 * last_len ends up being a counter of how many bytes we've defragged.
530 * every time we choose not to defrag an extent, we reset *last_len
531 * so that the next tiny extent will force a defrag.
532 *
533 * The end result of this is that tiny extents before a single big
534 * extent will force at least part of that big extent to be defragged.
535 */
536 if (ret) {
537 *last_len += len;
538 *defrag_end = extent_map_end(em);
539 } else {
540 *last_len = 0;
541 *skip = extent_map_end(em);
542 *defrag_end = 0;
543 }
544
545 free_extent_map(em);
546 return ret;
547}
548
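The heuristic in should_defrag_range() is easier to see as a single predicate, restated from the code above (nothing new is assumed):

	skip = (em->len >= thresh) && (*last_len == 0 || *last_len >= thresh)

So a large extent that immediately follows freshly defragged small extents (0 < *last_len < thresh) is defragged as well, which is how a run of tiny extents drags in at least part of the big extent behind them. Once a large extent is skipped, *last_len resets to 0, so the next large extent is skipped too.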
549static int btrfs_defrag_file(struct file *file,
550 struct btrfs_ioctl_defrag_range_args *range)
478{ 551{
479 struct inode *inode = fdentry(file)->d_inode; 552 struct inode *inode = fdentry(file)->d_inode;
480 struct btrfs_root *root = BTRFS_I(inode)->root; 553 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -486,37 +559,96 @@ static int btrfs_defrag_file(struct file *file)
486 unsigned long total_read = 0; 559 unsigned long total_read = 0;
487 u64 page_start; 560 u64 page_start;
488 u64 page_end; 561 u64 page_end;
562 u64 last_len = 0;
563 u64 skip = 0;
564 u64 defrag_end = 0;
489 unsigned long i; 565 unsigned long i;
490 int ret; 566 int ret;
491 567
492 ret = btrfs_check_data_free_space(root, inode, inode->i_size); 568 if (inode->i_size == 0)
493 if (ret) 569 return 0;
494 return -ENOSPC; 570
571 if (range->start + range->len > range->start) {
572 last_index = min_t(u64, inode->i_size - 1,
573 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
574 } else {
575 last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
576 }
577
578 i = range->start >> PAGE_CACHE_SHIFT;
579 while (i <= last_index) {
580 if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
581 PAGE_CACHE_SIZE,
582 range->extent_thresh,
583 &last_len, &skip,
584 &defrag_end)) {
585 unsigned long next;
586 /*
587 * the should_defrag_range function tells us how much to skip;
588 * bump our counter by the suggested amount
589 */
590 next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
591 i = max(i + 1, next);
592 continue;
593 }
495 594
496 mutex_lock(&inode->i_mutex);
497 last_index = inode->i_size >> PAGE_CACHE_SHIFT;
498 for (i = 0; i <= last_index; i++) {
499 if (total_read % ra_pages == 0) { 595 if (total_read % ra_pages == 0) {
500 btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i, 596 btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
501 min(last_index, i + ra_pages - 1)); 597 min(last_index, i + ra_pages - 1));
502 } 598 }
503 total_read++; 599 total_read++;
600 mutex_lock(&inode->i_mutex);
601 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
602 BTRFS_I(inode)->force_compress = 1;
603
604 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
605 if (ret) {
606 ret = -ENOSPC;
607 break;
608 }
609
610 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
611 if (ret) {
612 btrfs_free_reserved_data_space(root, inode,
613 PAGE_CACHE_SIZE);
614 ret = -ENOSPC;
615 break;
616 }
504again: 617again:
618 if (inode->i_size == 0 ||
619 i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
620 ret = 0;
621 goto err_reservations;
622 }
623
505 page = grab_cache_page(inode->i_mapping, i); 624 page = grab_cache_page(inode->i_mapping, i);
506 if (!page) 625 if (!page)
507 goto out_unlock; 626 goto err_reservations;
627
508 if (!PageUptodate(page)) { 628 if (!PageUptodate(page)) {
509 btrfs_readpage(NULL, page); 629 btrfs_readpage(NULL, page);
510 lock_page(page); 630 lock_page(page);
511 if (!PageUptodate(page)) { 631 if (!PageUptodate(page)) {
512 unlock_page(page); 632 unlock_page(page);
513 page_cache_release(page); 633 page_cache_release(page);
514 goto out_unlock; 634 goto err_reservations;
515 } 635 }
516 } 636 }
517 637
638 if (page->mapping != inode->i_mapping) {
639 unlock_page(page);
640 page_cache_release(page);
641 goto again;
642 }
643
518 wait_on_page_writeback(page); 644 wait_on_page_writeback(page);
519 645
646 if (PageDirty(page)) {
647 btrfs_free_reserved_data_space(root, inode,
648 PAGE_CACHE_SIZE);
649 goto loop_unlock;
650 }
651
520 page_start = (u64)page->index << PAGE_CACHE_SHIFT; 652 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
521 page_end = page_start + PAGE_CACHE_SIZE - 1; 653 page_end = page_start + PAGE_CACHE_SIZE - 1;
522 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 654 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
@@ -537,18 +669,54 @@ again:
537 * page if it is dirtied again later 669 * page if it is dirtied again later
538 */ 670 */
539 clear_page_dirty_for_io(page); 671 clear_page_dirty_for_io(page);
672 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start,
673 page_end, EXTENT_DIRTY | EXTENT_DELALLOC |
674 EXTENT_DO_ACCOUNTING, GFP_NOFS);
540 675
541 btrfs_set_extent_delalloc(inode, page_start, page_end); 676 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
677 ClearPageChecked(page);
542 set_page_dirty(page); 678 set_page_dirty(page);
543 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 679 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
680
681loop_unlock:
544 unlock_page(page); 682 unlock_page(page);
545 page_cache_release(page); 683 page_cache_release(page);
684 mutex_unlock(&inode->i_mutex);
685
686 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
546 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); 687 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
688 i++;
689 }
690
691 if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO))
692 filemap_flush(inode->i_mapping);
693
694 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
695 /* the filemap_flush will queue IO into the worker threads, but
696 * we have to make sure the IO is actually started and that
697 * ordered extents get created before we return
698 */
699 atomic_inc(&root->fs_info->async_submit_draining);
700 while (atomic_read(&root->fs_info->nr_async_submits) ||
701 atomic_read(&root->fs_info->async_delalloc_pages)) {
702 wait_event(root->fs_info->async_submit_wait,
703 (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
704 atomic_read(&root->fs_info->async_delalloc_pages) == 0));
705 }
706 atomic_dec(&root->fs_info->async_submit_draining);
707
708 mutex_lock(&inode->i_mutex);
709 BTRFS_I(inode)->force_compress = 0;
710 mutex_unlock(&inode->i_mutex);
547 } 711 }
548 712
549out_unlock:
550 mutex_unlock(&inode->i_mutex);
551 return 0; 713 return 0;
714
715err_reservations:
716 mutex_unlock(&inode->i_mutex);
717 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
718 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
719 return ret;
552} 720}
553 721
554static noinline int btrfs_ioctl_resize(struct btrfs_root *root, 722static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
@@ -608,7 +776,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
608 mod = 1; 776 mod = 1;
609 sizestr++; 777 sizestr++;
610 } 778 }
611 new_size = btrfs_parse_size(sizestr); 779 new_size = memparse(sizestr, NULL);
612 if (new_size == 0) { 780 if (new_size == 0) {
613 ret = -EINVAL; 781 ret = -EINVAL;
614 goto out_unlock; 782 goto out_unlock;
@@ -743,6 +911,330 @@ out:
743 return ret; 911 return ret;
744} 912}
745 913
914static noinline int key_in_sk(struct btrfs_key *key,
915 struct btrfs_ioctl_search_key *sk)
916{
917 struct btrfs_key test;
918 int ret;
919
920 test.objectid = sk->min_objectid;
921 test.type = sk->min_type;
922 test.offset = sk->min_offset;
923
924 ret = btrfs_comp_cpu_keys(key, &test);
925 if (ret < 0)
926 return 0;
927
928 test.objectid = sk->max_objectid;
929 test.type = sk->max_type;
930 test.offset = sk->max_offset;
931
932 ret = btrfs_comp_cpu_keys(key, &test);
933 if (ret > 0)
934 return 0;
935 return 1;
936}
937
938static noinline int copy_to_sk(struct btrfs_root *root,
939 struct btrfs_path *path,
940 struct btrfs_key *key,
941 struct btrfs_ioctl_search_key *sk,
942 char *buf,
943 unsigned long *sk_offset,
944 int *num_found)
945{
946 u64 found_transid;
947 struct extent_buffer *leaf;
948 struct btrfs_ioctl_search_header sh;
949 unsigned long item_off;
950 unsigned long item_len;
951 int nritems;
952 int i;
953 int slot;
954 int found = 0;
955 int ret = 0;
956
957 leaf = path->nodes[0];
958 slot = path->slots[0];
959 nritems = btrfs_header_nritems(leaf);
960
961 if (btrfs_header_generation(leaf) > sk->max_transid) {
962 i = nritems;
963 goto advance_key;
964 }
965 found_transid = btrfs_header_generation(leaf);
966
967 for (i = slot; i < nritems; i++) {
968 item_off = btrfs_item_ptr_offset(leaf, i);
969 item_len = btrfs_item_size_nr(leaf, i);
970
971 if (item_len > BTRFS_SEARCH_ARGS_BUFSIZE)
972 item_len = 0;
973
974 if (sizeof(sh) + item_len + *sk_offset >
975 BTRFS_SEARCH_ARGS_BUFSIZE) {
976 ret = 1;
977 goto overflow;
978 }
979
980 btrfs_item_key_to_cpu(leaf, key, i);
981 if (!key_in_sk(key, sk))
982 continue;
983
984 sh.objectid = key->objectid;
985 sh.offset = key->offset;
986 sh.type = key->type;
987 sh.len = item_len;
988 sh.transid = found_transid;
989
990 /* copy search result header */
991 memcpy(buf + *sk_offset, &sh, sizeof(sh));
992 *sk_offset += sizeof(sh);
993
994 if (item_len) {
995 char *p = buf + *sk_offset;
996 /* copy the item */
997 read_extent_buffer(leaf, p,
998 item_off, item_len);
999 *sk_offset += item_len;
1000 }
1001 found++;
1002
1003 if (*num_found >= sk->nr_items)
1004 break;
1005 }
1006advance_key:
1007 ret = 0;
1008 if (key->offset < (u64)-1 && key->offset < sk->max_offset)
1009 key->offset++;
1010 else if (key->type < (u8)-1 && key->type < sk->max_type) {
1011 key->offset = 0;
1012 key->type++;
1013 } else if (key->objectid < (u64)-1 && key->objectid < sk->max_objectid) {
1014 key->offset = 0;
1015 key->type = 0;
1016 key->objectid++;
1017 } else
1018 ret = 1;
1019overflow:
1020 *num_found += found;
1021 return ret;
1022}
1023
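The advance_key block treats a btrfs_key as one ordered (objectid, type, offset) value and steps to the lexicographically next key that can still lie inside the search bounds. Stripped of the sk->max_* clamping, the idea looks like this (key_advance is an illustrative helper, not part of the patch):

	static void key_advance(struct btrfs_key *key)
	{
		if (key->offset < (u64)-1) {
			key->offset++;
		} else if (key->type < (u8)-1) {
			key->offset = 0;
			key->type++;
		} else if (key->objectid < (u64)-1) {
			key->offset = 0;
			key->type = 0;
			key->objectid++;
		}
	}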
1024static noinline int search_ioctl(struct inode *inode,
1025 struct btrfs_ioctl_search_args *args)
1026{
1027 struct btrfs_root *root;
1028 struct btrfs_key key;
1029 struct btrfs_key max_key;
1030 struct btrfs_path *path;
1031 struct btrfs_ioctl_search_key *sk = &args->key;
1032 struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
1033 int ret;
1034 int num_found = 0;
1035 unsigned long sk_offset = 0;
1036
1037 path = btrfs_alloc_path();
1038 if (!path)
1039 return -ENOMEM;
1040
1041 if (sk->tree_id == 0) {
1042 /* search the root of the inode that was passed */
1043 root = BTRFS_I(inode)->root;
1044 } else {
1045 key.objectid = sk->tree_id;
1046 key.type = BTRFS_ROOT_ITEM_KEY;
1047 key.offset = (u64)-1;
1048 root = btrfs_read_fs_root_no_name(info, &key);
1049 if (IS_ERR(root)) {
1050 printk(KERN_ERR "could not find root %llu\n",
1051 sk->tree_id);
1052 btrfs_free_path(path);
1053 return -ENOENT;
1054 }
1055 }
1056
1057 key.objectid = sk->min_objectid;
1058 key.type = sk->min_type;
1059 key.offset = sk->min_offset;
1060
1061 max_key.objectid = sk->max_objectid;
1062 max_key.type = sk->max_type;
1063 max_key.offset = sk->max_offset;
1064
1065 path->keep_locks = 1;
1066
1067 while (1) {
1068 ret = btrfs_search_forward(root, &key, &max_key, path, 0,
1069 sk->min_transid);
1070 if (ret != 0) {
1071 if (ret > 0)
1072 ret = 0;
1073 goto err;
1074 }
1075 ret = copy_to_sk(root, path, &key, sk, args->buf,
1076 &sk_offset, &num_found);
1077 btrfs_release_path(root, path);
1078 if (ret || num_found >= sk->nr_items)
1079 break;
1080
1081 }
1082 ret = 0;
1083err:
1084 sk->nr_items = num_found;
1085 btrfs_free_path(path);
1086 return ret;
1087}
1088
1089static noinline int btrfs_ioctl_tree_search(struct file *file,
1090 void __user *argp)
1091{
1092 struct btrfs_ioctl_search_args *args;
1093 struct inode *inode;
1094 int ret;
1095
1096 if (!capable(CAP_SYS_ADMIN))
1097 return -EPERM;
1098
1099 args = kmalloc(sizeof(*args), GFP_KERNEL);
1100 if (!args)
1101 return -ENOMEM;
1102
1103 if (copy_from_user(args, argp, sizeof(*args))) {
1104 kfree(args);
1105 return -EFAULT;
1106 }
1107 inode = fdentry(file)->d_inode;
1108 ret = search_ioctl(inode, args);
1109 if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
1110 ret = -EFAULT;
1111 kfree(args);
1112 return ret;
1113}
1114
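From userland, the search is driven by filling the key with inclusive (objectid, type, offset) bounds and a transid window. A sketch that fetches up to 64 items belonging to one inode in the subvolume behind fd (assumes the BTRFS_IOC_TREE_SEARCH request number this patch adds to ioctl.h; includes and error handling omitted):

	struct btrfs_ioctl_search_args args;

	memset(&args, 0, sizeof(args));
	args.key.tree_id = 0;			/* tree that fd lives in */
	args.key.min_objectid = ino;
	args.key.max_objectid = ino;
	args.key.max_type = (__u32)-1;		/* any item type */
	args.key.max_offset = (__u64)-1;
	args.key.max_transid = (__u64)-1;
	args.key.nr_items = 64;

	ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args);
	/* on return, args.key.nr_items is the number of search
	   headers (plus item payloads) packed into args.buf */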
1115/*
1116 * Search INODE_REFs to identify the path name of the 'dirid' directory
1117 * in a 'tree_id' tree, and store the path name in 'name'.
1118 */
1119static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
1120 u64 tree_id, u64 dirid, char *name)
1121{
1122 struct btrfs_root *root;
1123 struct btrfs_key key;
1124 char *ptr;
1125 int ret = -1;
1126 int slot;
1127 int len;
1128 int total_len = 0;
1129 struct btrfs_inode_ref *iref;
1130 struct extent_buffer *l;
1131 struct btrfs_path *path;
1132
1133 if (dirid == BTRFS_FIRST_FREE_OBJECTID) {
1134 name[0] = '\0';
1135 return 0;
1136 }
1137
1138 path = btrfs_alloc_path();
1139 if (!path)
1140 return -ENOMEM;
1141
1142 ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX];
1143
1144 key.objectid = tree_id;
1145 key.type = BTRFS_ROOT_ITEM_KEY;
1146 key.offset = (u64)-1;
1147 root = btrfs_read_fs_root_no_name(info, &key);
1148 if (IS_ERR(root)) {
1149 printk(KERN_ERR "could not find root %llu\n", tree_id);
1150 ret = -ENOENT;
1151 goto out;
1152 }
1153
1154 key.objectid = dirid;
1155 key.type = BTRFS_INODE_REF_KEY;
1156 key.offset = (u64)-1;
1157
1158 while (1) {
1159 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1160 if (ret < 0)
1161 goto out;
1162
1163 l = path->nodes[0];
1164 slot = path->slots[0];
1165 if (ret > 0 && slot > 0)
1166 slot--;
1167 btrfs_item_key_to_cpu(l, &key, slot);
1168
1169 if (ret > 0 && (key.objectid != dirid ||
1170 key.type != BTRFS_INODE_REF_KEY)) {
1171 ret = -ENOENT;
1172 goto out;
1173 }
1174
1175 iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
1176 len = btrfs_inode_ref_name_len(l, iref);
1177 ptr -= len + 1;
1178 total_len += len + 1;
1179 if (ptr < name)
1180 goto out;
1181
1182 *(ptr + len) = '/';
1183 read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len);
1184
1185 if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
1186 break;
1187
1188 btrfs_release_path(root, path);
1189 key.objectid = key.offset;
1190 key.offset = (u64)-1;
1191 dirid = key.objectid;
1192
1193 }
1194 if (ptr < name)
1195 goto out;
1196 memcpy(name, ptr, total_len);
1197 name[total_len] = '\0';
1198 ret = 0;
1199out:
1200 btrfs_free_path(path);
1201 return ret;
1202}
1203
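The loop above builds the path right to left: ptr starts at the end of the BTRFS_INO_LOOKUP_PATH_MAX buffer, each INODE_REF prepends one "component/" chunk, and key.offset (the parent directory's objectid) steers the walk upward until the parent is the subvolume root. An illustrative trace for a directory /a/b/c, with dirid set to c's inode number:

	/*
	 * step 1: ref of c, parent b  -> buffer tail holds "c/"
	 * step 2: ref of b, parent a  -> buffer tail holds "b/c/"
	 * step 3: ref of a, parent is
	 *         BTRFS_FIRST_FREE_OBJECTID -> "a/b/c/", loop breaks
	 * finally memcpy() shifts "a/b/c/" to name[0] and NUL-terminates.
	 */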
1204static noinline int btrfs_ioctl_ino_lookup(struct file *file,
1205 void __user *argp)
1206{
1207 struct btrfs_ioctl_ino_lookup_args *args;
1208 struct inode *inode;
1209 int ret;
1210
1211 if (!capable(CAP_SYS_ADMIN))
1212 return -EPERM;
1213
1214 args = kmalloc(sizeof(*args), GFP_KERNEL);
1215 if (!args)
1216 return -ENOMEM;
1217
1218 if (copy_from_user(args, argp, sizeof(*args))) {
1219 kfree(args);
1220 return -EFAULT;
1221 }
1222 inode = fdentry(file)->d_inode;
1223
1224 if (args->treeid == 0)
1225 args->treeid = BTRFS_I(inode)->root->root_key.objectid;
1226
1227 ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
1228 args->treeid, args->objectid,
1229 args->name);
1230
1231 if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
1232 ret = -EFAULT;
1233
1234 kfree(args);
1235 return ret;
1236}
1237
746static noinline int btrfs_ioctl_snap_destroy(struct file *file, 1238static noinline int btrfs_ioctl_snap_destroy(struct file *file,
747 void __user *arg) 1239 void __user *arg)
748{ 1240{
@@ -849,10 +1341,11 @@ out:
849 return err; 1341 return err;
850} 1342}
851 1343
852static int btrfs_ioctl_defrag(struct file *file) 1344static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
853{ 1345{
854 struct inode *inode = fdentry(file)->d_inode; 1346 struct inode *inode = fdentry(file)->d_inode;
855 struct btrfs_root *root = BTRFS_I(inode)->root; 1347 struct btrfs_root *root = BTRFS_I(inode)->root;
1348 struct btrfs_ioctl_defrag_range_args *range;
856 int ret; 1349 int ret;
857 1350
858 ret = mnt_want_write(file->f_path.mnt); 1351 ret = mnt_want_write(file->f_path.mnt);
@@ -873,7 +1366,31 @@ static int btrfs_ioctl_defrag(struct file *file)
873 ret = -EINVAL; 1366 ret = -EINVAL;
874 goto out; 1367 goto out;
875 } 1368 }
876 btrfs_defrag_file(file); 1369
1370 range = kzalloc(sizeof(*range), GFP_KERNEL);
1371 if (!range) {
1372 ret = -ENOMEM;
1373 goto out;
1374 }
1375
1376 if (argp) {
1377 if (copy_from_user(range, argp,
1378 sizeof(*range))) {
1379 ret = -EFAULT;
1380 kfree(range);
1381 goto out;
1382 }
1383 /* compression requires us to start the IO */
1384 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
1385 range->flags |= BTRFS_DEFRAG_RANGE_START_IO;
1386 range->extent_thresh = (u32)-1;
1387 }
1388 } else {
1389 /* the rest are all set to zero by kzalloc */
1390 range->len = (u64)-1;
1391 }
1392 btrfs_defrag_file(file, range);
1393 kfree(range);
877 break; 1394 break;
878 } 1395 }
879out: 1396out:
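A userland sketch of the new range interface (assumes the BTRFS_IOC_DEFRAG_RANGE request number and the btrfs_ioctl_defrag_range_args layout this patch adds to ioctl.h; fd is an open file on btrfs, includes and error handling trimmed):

	struct btrfs_ioctl_defrag_range_args range;

	memset(&range, 0, sizeof(range));
	range.start = 0;
	range.len = (__u64)-1;			/* defrag the whole file */
	range.extent_thresh = 256 * 1024;	/* leave extents >= 256KiB alone */
	range.flags = BTRFS_DEFRAG_RANGE_START_IO;

	if (ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range) < 0)
		perror("BTRFS_IOC_DEFRAG_RANGE");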
@@ -964,12 +1481,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
964 ret = -EBADF; 1481 ret = -EBADF;
965 goto out_drop_write; 1482 goto out_drop_write;
966 } 1483 }
1484
967 src = src_file->f_dentry->d_inode; 1485 src = src_file->f_dentry->d_inode;
968 1486
969 ret = -EINVAL; 1487 ret = -EINVAL;
970 if (src == inode) 1488 if (src == inode)
971 goto out_fput; 1489 goto out_fput;
972 1490
1491 /* the src must be open for reading */
1492 if (!(src_file->f_mode & FMODE_READ))
1493 goto out_fput;
1494
973 ret = -EISDIR; 1495 ret = -EISDIR;
974 if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) 1496 if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
975 goto out_fput; 1497 goto out_fput;
@@ -1274,6 +1796,157 @@ out:
1274 return ret; 1796 return ret;
1275} 1797}
1276 1798
1799static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
1800{
1801 struct inode *inode = fdentry(file)->d_inode;
1802 struct btrfs_root *root = BTRFS_I(inode)->root;
1803 struct btrfs_root *new_root;
1804 struct btrfs_dir_item *di;
1805 struct btrfs_trans_handle *trans;
1806 struct btrfs_path *path;
1807 struct btrfs_key location;
1808 struct btrfs_disk_key disk_key;
1809 struct btrfs_super_block *disk_super;
1810 u64 features;
1811 u64 objectid = 0;
1812 u64 dir_id;
1813
1814 if (!capable(CAP_SYS_ADMIN))
1815 return -EPERM;
1816
1817 if (copy_from_user(&objectid, argp, sizeof(objectid)))
1818 return -EFAULT;
1819
1820 if (!objectid)
1821 objectid = root->root_key.objectid;
1822
1823 location.objectid = objectid;
1824 location.type = BTRFS_ROOT_ITEM_KEY;
1825 location.offset = (u64)-1;
1826
1827 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
1828 if (IS_ERR(new_root))
1829 return PTR_ERR(new_root);
1830
1831 if (btrfs_root_refs(&new_root->root_item) == 0)
1832 return -ENOENT;
1833
1834 path = btrfs_alloc_path();
1835 if (!path)
1836 return -ENOMEM;
1837 path->leave_spinning = 1;
1838
1839 trans = btrfs_start_transaction(root, 1);
1840 if (!trans) {
1841 btrfs_free_path(path);
1842 return -ENOMEM;
1843 }
1844
1845 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
1846 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
1847 dir_id, "default", 7, 1);
1848 if (!di) {
1849 btrfs_free_path(path);
1850 btrfs_end_transaction(trans, root);
1851 printk(KERN_ERR "Umm, you don't have the default dir item, "
1852 "this isn't going to work\n");
1853 return -ENOENT;
1854 }
1855
1856 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
1857 btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
1858 btrfs_mark_buffer_dirty(path->nodes[0]);
1859 btrfs_free_path(path);
1860
1861 disk_super = &root->fs_info->super_copy;
1862 features = btrfs_super_incompat_flags(disk_super);
1863 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
1864 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
1865 btrfs_set_super_incompat_flags(disk_super, features);
1866 }
1867 btrfs_end_transaction(trans, root);
1868
1869 return 0;
1870}
1871
1872long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
1873{
1874 struct btrfs_ioctl_space_args space_args;
1875 struct btrfs_ioctl_space_info space;
1876 struct btrfs_ioctl_space_info *dest;
1877 struct btrfs_ioctl_space_info *dest_orig;
1878 struct btrfs_ioctl_space_info *user_dest;
1879 struct btrfs_space_info *info;
1880 int alloc_size;
1881 int ret = 0;
1882 int slot_count = 0;
1883
1884 if (copy_from_user(&space_args,
1885 (struct btrfs_ioctl_space_args __user *)arg,
1886 sizeof(space_args)))
1887 return -EFAULT;
1888
1889 /* first we count slots */
1890 rcu_read_lock();
1891 list_for_each_entry_rcu(info, &root->fs_info->space_info, list)
1892 slot_count++;
1893 rcu_read_unlock();
1894
1895 /* space_slots == 0 means they are asking for a count */
1896 if (space_args.space_slots == 0) {
1897 space_args.total_spaces = slot_count;
1898 goto out;
1899 }
1900 alloc_size = sizeof(*dest) * slot_count;
1901 /* we generally have at most 6 or so space infos, one for each raid
1902 * level. So, a whole page should be more than enough for everyone
1903 */
1904 if (alloc_size > PAGE_CACHE_SIZE)
1905 return -ENOMEM;
1906
1907 space_args.total_spaces = 0;
1908 dest = kmalloc(alloc_size, GFP_NOFS);
1909 if (!dest)
1910 return -ENOMEM;
1911 dest_orig = dest;
1912
1913 /* now we have a buffer to copy into */
1914 rcu_read_lock();
1915 list_for_each_entry_rcu(info, &root->fs_info->space_info, list) {
1916 /* make sure we don't copy more than we allocated
1917 * in our buffer
1918 */
1919 if (slot_count == 0)
1920 break;
1921 slot_count--;
1922
1923 /* make sure userland has enough room in their buffer */
1924 if (space_args.total_spaces >= space_args.space_slots)
1925 break;
1926
1927 space.flags = info->flags;
1928 space.total_bytes = info->total_bytes;
1929 space.used_bytes = info->bytes_used;
1930 memcpy(dest, &space, sizeof(space));
1931 dest++;
1932 space_args.total_spaces++;
1933 }
1934 rcu_read_unlock();
1935
1936 user_dest = (struct btrfs_ioctl_space_info *)
1937 (arg + sizeof(struct btrfs_ioctl_space_args));
1938
1939 if (copy_to_user(user_dest, dest_orig, alloc_size))
1940 ret = -EFAULT;
1941
1942 kfree(dest_orig);
1943out:
1944 if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args)))
1945 ret = -EFAULT;
1946
1947 return ret;
1948}
1949
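The function above defines a two-call protocol: ask for a slot count first, then pass a buffer sized to match. A hedged userspace sketch, with the helper name and error handling purely illustrative:

	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include "ioctl.h"	/* struct btrfs_ioctl_space_args and friends */

	static int print_space_info(int fd)
	{
		struct btrfs_ioctl_space_args count = { .space_slots = 0 };
		struct btrfs_ioctl_space_args *args;
		unsigned long long i;

		/* pass 1: space_slots == 0 only asks how many slots exist */
		if (ioctl(fd, BTRFS_IOC_SPACE_INFO, &count) < 0)
			return -1;

		/* pass 2: allocate room for that many slots and fetch them */
		args = calloc(1, sizeof(*args) +
			      count.total_spaces * sizeof(args->spaces[0]));
		if (!args)
			return -1;
		args->space_slots = count.total_spaces;
		if (ioctl(fd, BTRFS_IOC_SPACE_INFO, args) < 0) {
			free(args);
			return -1;
		}
		for (i = 0; i < args->total_spaces; i++)
			printf("flags 0x%llx total %llu used %llu\n",
			       (unsigned long long)args->spaces[i].flags,
			       (unsigned long long)args->spaces[i].total_bytes,
			       (unsigned long long)args->spaces[i].used_bytes);
		free(args);
		return 0;
	}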
1277/* 1950/*
1278 * there are many ways the trans_start and trans_end ioctls can lead 1951 * there are many ways the trans_start and trans_end ioctls can lead
1279 * to deadlocks. They should only be used by applications that 1952 * to deadlocks. They should only be used by applications that
@@ -1320,8 +1993,12 @@ long btrfs_ioctl(struct file *file, unsigned int
1320 return btrfs_ioctl_snap_create(file, argp, 1); 1993 return btrfs_ioctl_snap_create(file, argp, 1);
1321 case BTRFS_IOC_SNAP_DESTROY: 1994 case BTRFS_IOC_SNAP_DESTROY:
1322 return btrfs_ioctl_snap_destroy(file, argp); 1995 return btrfs_ioctl_snap_destroy(file, argp);
1996 case BTRFS_IOC_DEFAULT_SUBVOL:
1997 return btrfs_ioctl_default_subvol(file, argp);
1323 case BTRFS_IOC_DEFRAG: 1998 case BTRFS_IOC_DEFRAG:
1324 return btrfs_ioctl_defrag(file); 1999 return btrfs_ioctl_defrag(file, NULL);
2000 case BTRFS_IOC_DEFRAG_RANGE:
2001 return btrfs_ioctl_defrag(file, argp);
1325 case BTRFS_IOC_RESIZE: 2002 case BTRFS_IOC_RESIZE:
1326 return btrfs_ioctl_resize(root, argp); 2003 return btrfs_ioctl_resize(root, argp);
1327 case BTRFS_IOC_ADD_DEV: 2004 case BTRFS_IOC_ADD_DEV:
@@ -1338,6 +2015,12 @@ long btrfs_ioctl(struct file *file, unsigned int
1338 return btrfs_ioctl_trans_start(file); 2015 return btrfs_ioctl_trans_start(file);
1339 case BTRFS_IOC_TRANS_END: 2016 case BTRFS_IOC_TRANS_END:
1340 return btrfs_ioctl_trans_end(file); 2017 return btrfs_ioctl_trans_end(file);
2018 case BTRFS_IOC_TREE_SEARCH:
2019 return btrfs_ioctl_tree_search(file, argp);
2020 case BTRFS_IOC_INO_LOOKUP:
2021 return btrfs_ioctl_ino_lookup(file, argp);
2022 case BTRFS_IOC_SPACE_INFO:
2023 return btrfs_ioctl_space_info(root, argp);
1341 case BTRFS_IOC_SYNC: 2024 case BTRFS_IOC_SYNC:
1342 btrfs_sync_fs(file->f_dentry->d_sb, 1); 2025 btrfs_sync_fs(file->f_dentry->d_sb, 1);
1343 return 0; 2026 return 0;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index bc49914475eb..424694aa517f 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,12 +30,114 @@ struct btrfs_ioctl_vol_args {
30 char name[BTRFS_PATH_NAME_MAX + 1]; 30 char name[BTRFS_PATH_NAME_MAX + 1];
31}; 31};
32 32
33#define BTRFS_INO_LOOKUP_PATH_MAX 4080
34struct btrfs_ioctl_ino_lookup_args {
35 __u64 treeid;
36 __u64 objectid;
37 char name[BTRFS_INO_LOOKUP_PATH_MAX];
38};
39
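A hedged fragment of how userspace might fill these lookup args; fd and ino are assumed to come from the caller:

	struct btrfs_ioctl_ino_lookup_args args;

	memset(&args, 0, sizeof(args));
	args.treeid = 0;	/* 0 is treated as "the fd's own subvolume" */
	args.objectid = ino;	/* inode number to resolve back to a path */
	if (ioctl(fd, BTRFS_IOC_INO_LOOKUP, &args) == 0)
		printf("%s\n", args.name);	/* path below the subvolume root */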
40struct btrfs_ioctl_search_key {
41 /* which root are we searching. 0 is the tree of tree roots */
42 __u64 tree_id;
43
44 /* keys returned will be >= min and <= max */
45 __u64 min_objectid;
46 __u64 max_objectid;
47
48 /* keys returned will be >= min and <= max */
49 __u64 min_offset;
50 __u64 max_offset;
51
52 /* max and min transids to search for */
53 __u64 min_transid;
54 __u64 max_transid;
55
56 /* keys returned will be >= min and <= max */
57 __u32 min_type;
58 __u32 max_type;
59
60 /*
61 * how many items did userland ask for, and how many are we
62 * returning
63 */
64 __u32 nr_items;
65
66 /* align to 64 bits */
67 __u32 unused;
68
69 /* some extra for later */
70 __u64 unused1;
71 __u64 unused2;
72 __u64 unused3;
73 __u64 unused4;
74};
75
76struct btrfs_ioctl_search_header {
77 __u64 transid;
78 __u64 objectid;
79 __u64 offset;
80 __u32 type;
81 __u32 len;
82};
83
84#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key))
85/*
86 * the buf is an array of search headers where
87 * each header is followed by the actual item
88 * the type field is expanded to 32 bits for alignment
89 */
90struct btrfs_ioctl_search_args {
91 struct btrfs_ioctl_search_key key;
92 char buf[BTRFS_SEARCH_ARGS_BUFSIZE];
93};
94
33struct btrfs_ioctl_clone_range_args { 95struct btrfs_ioctl_clone_range_args {
34 __s64 src_fd; 96 __s64 src_fd;
35 __u64 src_offset, src_length; 97 __u64 src_offset, src_length;
36 __u64 dest_offset; 98 __u64 dest_offset;
37}; 99};
38 100
101/* flags for the defrag range ioctl */
102#define BTRFS_DEFRAG_RANGE_COMPRESS 1
103#define BTRFS_DEFRAG_RANGE_START_IO 2
104
105struct btrfs_ioctl_defrag_range_args {
106 /* start of the defrag operation */
107 __u64 start;
108
109 /* number of bytes to defrag, use (u64)-1 to say all */
110 __u64 len;
111
112 /*
113 * flags for the operation, which can include turning
114 * on compression for this one defrag
115 */
116 __u64 flags;
117
118 /*
119 * any extent bigger than this will be considered
120 * already defragged. Use 0 to take the kernel default
121 * Use 1 to say every single extent must be rewritten
122 */
123 __u32 extent_thresh;
124
125 /* spare for later */
126 __u32 unused[5];
127};
128
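To exercise the range interface above from userspace, a fragment like the following would be the natural shape (fd is assumed, and the flag choice is just an example):

	struct btrfs_ioctl_defrag_range_args range;

	memset(&range, 0, sizeof(range));
	range.start = 0;
	range.len = (__u64)-1;				/* the whole file */
	range.flags = BTRFS_DEFRAG_RANGE_COMPRESS;	/* recompress while rewriting */
	range.extent_thresh = 0;			/* accept the kernel default */
	ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range);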
129struct btrfs_ioctl_space_info {
130 __u64 flags;
131 __u64 total_bytes;
132 __u64 used_bytes;
133};
134
135struct btrfs_ioctl_space_args {
136 __u64 space_slots;
137 __u64 total_spaces;
138 struct btrfs_ioctl_space_info spaces[0];
139};
140
39#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ 141#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
40 struct btrfs_ioctl_vol_args) 142 struct btrfs_ioctl_vol_args)
41#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ 143#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -67,4 +169,13 @@ struct btrfs_ioctl_clone_range_args {
67 struct btrfs_ioctl_vol_args) 169 struct btrfs_ioctl_vol_args)
68#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \ 170#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
69 struct btrfs_ioctl_vol_args) 171 struct btrfs_ioctl_vol_args)
172#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \
173 struct btrfs_ioctl_defrag_range_args)
174#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \
175 struct btrfs_ioctl_search_args)
176#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \
177 struct btrfs_ioctl_ino_lookup_args)
178#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
179#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
180 struct btrfs_ioctl_space_args)
70#endif 181#endif
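To show how the search half of this header is meant to be consumed, a minimal sketch: widen every min/max pair to cover the keyspace, issue the ioctl, then walk the packed header-plus-item records in buf. The tree_id value follows the struct comment above (0 for the tree of tree roots); the fd and function name are assumptions of the example.

	#include <string.h>
	#include <sys/ioctl.h>
	#include "ioctl.h"	/* local copy of the structures above */

	static int dump_tree_keys(int fd)
	{
		struct btrfs_ioctl_search_args args;
		struct btrfs_ioctl_search_header *sh;
		unsigned long off = 0;
		__u32 i;

		memset(&args, 0, sizeof(args));	/* all min_* fields start at 0 */
		args.key.tree_id = 0;		/* per the comment: tree of tree roots */
		args.key.max_objectid = (__u64)-1;
		args.key.max_offset = (__u64)-1;
		args.key.max_transid = (__u64)-1;
		args.key.max_type = (__u32)-1;
		args.key.nr_items = 4096;	/* upper bound we are asking for */

		if (ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args) < 0)
			return -1;

		/* on return nr_items holds the number of records delivered */
		for (i = 0; i < args.key.nr_items; i++) {
			sh = (struct btrfs_ioctl_search_header *)(args.buf + off);
			/* sh->objectid, sh->type, sh->offset name the key;
			 * the item body follows the header, sh->len bytes long */
			off += sizeof(*sh) + sh->len;
		}
		return 0;
	}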
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 1c36e5cd8f55..6151f2ea38bb 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -16,7 +16,6 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/gfp.h>
20#include <linux/pagemap.h> 19#include <linux/pagemap.h>
21#include <linux/spinlock.h> 20#include <linux/spinlock.h>
22#include <linux/page-flags.h> 21#include <linux/page-flags.h>
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 5c2a9e78a949..a127c0ebb2dc 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -16,7 +16,6 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/gfp.h>
20#include <linux/slab.h> 19#include <linux/slab.h>
21#include <linux/blkdev.h> 20#include <linux/blkdev.h>
22#include <linux/writeback.h> 21#include <linux/writeback.h>
@@ -174,7 +173,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
174 if (!entry) 173 if (!entry)
175 return -ENOMEM; 174 return -ENOMEM;
176 175
177 mutex_lock(&tree->mutex);
178 entry->file_offset = file_offset; 176 entry->file_offset = file_offset;
179 entry->start = start; 177 entry->start = start;
180 entry->len = len; 178 entry->len = len;
@@ -190,16 +188,17 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
190 INIT_LIST_HEAD(&entry->list); 188 INIT_LIST_HEAD(&entry->list);
191 INIT_LIST_HEAD(&entry->root_extent_list); 189 INIT_LIST_HEAD(&entry->root_extent_list);
192 190
191 spin_lock(&tree->lock);
193 node = tree_insert(&tree->tree, file_offset, 192 node = tree_insert(&tree->tree, file_offset,
194 &entry->rb_node); 193 &entry->rb_node);
195 BUG_ON(node); 194 BUG_ON(node);
195 spin_unlock(&tree->lock);
196 196
197 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 197 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
198 list_add_tail(&entry->root_extent_list, 198 list_add_tail(&entry->root_extent_list,
199 &BTRFS_I(inode)->root->fs_info->ordered_extents); 199 &BTRFS_I(inode)->root->fs_info->ordered_extents);
200 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 200 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
201 201
202 mutex_unlock(&tree->mutex);
203 BUG_ON(node); 202 BUG_ON(node);
204 return 0; 203 return 0;
205} 204}
@@ -216,9 +215,9 @@ int btrfs_add_ordered_sum(struct inode *inode,
216 struct btrfs_ordered_inode_tree *tree; 215 struct btrfs_ordered_inode_tree *tree;
217 216
218 tree = &BTRFS_I(inode)->ordered_tree; 217 tree = &BTRFS_I(inode)->ordered_tree;
219 mutex_lock(&tree->mutex); 218 spin_lock(&tree->lock);
220 list_add_tail(&sum->list, &entry->list); 219 list_add_tail(&sum->list, &entry->list);
221 mutex_unlock(&tree->mutex); 220 spin_unlock(&tree->lock);
222 return 0; 221 return 0;
223} 222}
224 223
@@ -232,15 +231,16 @@ int btrfs_add_ordered_sum(struct inode *inode,
232 * to make sure this function only returns 1 once for a given ordered extent. 231 * to make sure this function only returns 1 once for a given ordered extent.
233 */ 232 */
234int btrfs_dec_test_ordered_pending(struct inode *inode, 233int btrfs_dec_test_ordered_pending(struct inode *inode,
234 struct btrfs_ordered_extent **cached,
235 u64 file_offset, u64 io_size) 235 u64 file_offset, u64 io_size)
236{ 236{
237 struct btrfs_ordered_inode_tree *tree; 237 struct btrfs_ordered_inode_tree *tree;
238 struct rb_node *node; 238 struct rb_node *node;
239 struct btrfs_ordered_extent *entry; 239 struct btrfs_ordered_extent *entry = NULL;
240 int ret; 240 int ret;
241 241
242 tree = &BTRFS_I(inode)->ordered_tree; 242 tree = &BTRFS_I(inode)->ordered_tree;
243 mutex_lock(&tree->mutex); 243 spin_lock(&tree->lock);
244 node = tree_search(tree, file_offset); 244 node = tree_search(tree, file_offset);
245 if (!node) { 245 if (!node) {
246 ret = 1; 246 ret = 1;
@@ -264,7 +264,11 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
264 else 264 else
265 ret = 1; 265 ret = 1;
266out: 266out:
267 mutex_unlock(&tree->mutex); 267 if (!ret && cached && entry) {
268 *cached = entry;
269 atomic_inc(&entry->refs);
270 }
271 spin_unlock(&tree->lock);
268 return ret == 0; 272 return ret == 0;
269} 273}
270 274
@@ -291,13 +295,14 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
291 295
292/* 296/*
293 * remove an ordered extent from the tree. No references are dropped 297 * remove an ordered extent from the tree. No references are dropped
294 * and you must wake_up entry->wait. You must hold the tree mutex 298 * and you must wake_up entry->wait. You must hold the tree lock
295 * while you call this function. 299 * while you call this function.
296 */ 300 */
297static int __btrfs_remove_ordered_extent(struct inode *inode, 301static int __btrfs_remove_ordered_extent(struct inode *inode,
298 struct btrfs_ordered_extent *entry) 302 struct btrfs_ordered_extent *entry)
299{ 303{
300 struct btrfs_ordered_inode_tree *tree; 304 struct btrfs_ordered_inode_tree *tree;
305 struct btrfs_root *root = BTRFS_I(inode)->root;
301 struct rb_node *node; 306 struct rb_node *node;
302 307
303 tree = &BTRFS_I(inode)->ordered_tree; 308 tree = &BTRFS_I(inode)->ordered_tree;
@@ -307,12 +312,13 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
307 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 312 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
308 313
309 spin_lock(&BTRFS_I(inode)->accounting_lock); 314 spin_lock(&BTRFS_I(inode)->accounting_lock);
315 WARN_ON(!BTRFS_I(inode)->outstanding_extents);
310 BTRFS_I(inode)->outstanding_extents--; 316 BTRFS_I(inode)->outstanding_extents--;
311 spin_unlock(&BTRFS_I(inode)->accounting_lock); 317 spin_unlock(&BTRFS_I(inode)->accounting_lock);
312 btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root, 318 btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
313 inode, 1); 319 inode, 1);
314 320
315 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 321 spin_lock(&root->fs_info->ordered_extent_lock);
316 list_del_init(&entry->root_extent_list); 322 list_del_init(&entry->root_extent_list);
317 323
318 /* 324 /*
@@ -324,7 +330,7 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
324 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { 330 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
325 list_del_init(&BTRFS_I(inode)->ordered_operations); 331 list_del_init(&BTRFS_I(inode)->ordered_operations);
326 } 332 }
327 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 333 spin_unlock(&root->fs_info->ordered_extent_lock);
328 334
329 return 0; 335 return 0;
330} 336}
@@ -340,9 +346,9 @@ int btrfs_remove_ordered_extent(struct inode *inode,
340 int ret; 346 int ret;
341 347
342 tree = &BTRFS_I(inode)->ordered_tree; 348 tree = &BTRFS_I(inode)->ordered_tree;
343 mutex_lock(&tree->mutex); 349 spin_lock(&tree->lock);
344 ret = __btrfs_remove_ordered_extent(inode, entry); 350 ret = __btrfs_remove_ordered_extent(inode, entry);
345 mutex_unlock(&tree->mutex); 351 spin_unlock(&tree->lock);
346 wake_up(&entry->wait); 352 wake_up(&entry->wait);
347 353
348 return ret; 354 return ret;
@@ -567,7 +573,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
567 struct btrfs_ordered_extent *entry = NULL; 573 struct btrfs_ordered_extent *entry = NULL;
568 574
569 tree = &BTRFS_I(inode)->ordered_tree; 575 tree = &BTRFS_I(inode)->ordered_tree;
570 mutex_lock(&tree->mutex); 576 spin_lock(&tree->lock);
571 node = tree_search(tree, file_offset); 577 node = tree_search(tree, file_offset);
572 if (!node) 578 if (!node)
573 goto out; 579 goto out;
@@ -578,7 +584,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
578 if (entry) 584 if (entry)
579 atomic_inc(&entry->refs); 585 atomic_inc(&entry->refs);
580out: 586out:
581 mutex_unlock(&tree->mutex); 587 spin_unlock(&tree->lock);
582 return entry; 588 return entry;
583} 589}
584 590
@@ -594,7 +600,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
594 struct btrfs_ordered_extent *entry = NULL; 600 struct btrfs_ordered_extent *entry = NULL;
595 601
596 tree = &BTRFS_I(inode)->ordered_tree; 602 tree = &BTRFS_I(inode)->ordered_tree;
597 mutex_lock(&tree->mutex); 603 spin_lock(&tree->lock);
598 node = tree_search(tree, file_offset); 604 node = tree_search(tree, file_offset);
599 if (!node) 605 if (!node)
600 goto out; 606 goto out;
@@ -602,7 +608,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
602 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); 608 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
603 atomic_inc(&entry->refs); 609 atomic_inc(&entry->refs);
604out: 610out:
605 mutex_unlock(&tree->mutex); 611 spin_unlock(&tree->lock);
606 return entry; 612 return entry;
607} 613}
608 614
@@ -629,7 +635,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
629 else 635 else
630 offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); 636 offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
631 637
632 mutex_lock(&tree->mutex); 638 spin_lock(&tree->lock);
633 disk_i_size = BTRFS_I(inode)->disk_i_size; 639 disk_i_size = BTRFS_I(inode)->disk_i_size;
634 640
635 /* truncate file */ 641 /* truncate file */
@@ -735,7 +741,7 @@ out:
735 */ 741 */
736 if (ordered) 742 if (ordered)
737 __btrfs_remove_ordered_extent(inode, ordered); 743 __btrfs_remove_ordered_extent(inode, ordered);
738 mutex_unlock(&tree->mutex); 744 spin_unlock(&tree->lock);
739 if (ordered) 745 if (ordered)
740 wake_up(&ordered->wait); 746 wake_up(&ordered->wait);
741 return ret; 747 return ret;
@@ -762,7 +768,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
762 if (!ordered) 768 if (!ordered)
763 return 1; 769 return 1;
764 770
765 mutex_lock(&tree->mutex); 771 spin_lock(&tree->lock);
766 list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { 772 list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
767 if (disk_bytenr >= ordered_sum->bytenr) { 773 if (disk_bytenr >= ordered_sum->bytenr) {
768 num_sectors = ordered_sum->len / sectorsize; 774 num_sectors = ordered_sum->len / sectorsize;
@@ -777,7 +783,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
777 } 783 }
778 } 784 }
779out: 785out:
780 mutex_unlock(&tree->mutex); 786 spin_unlock(&tree->lock);
781 btrfs_put_ordered_extent(ordered); 787 btrfs_put_ordered_extent(ordered);
782 return ret; 788 return ret;
783} 789}
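The new cached argument saves callers a second tree walk. A hedged kernel-side fragment; inode, start and len are assumed from the surrounding write-completion path:

	struct btrfs_ordered_extent *ordered = NULL;

	/* returns nonzero only once every byte of the extent is written;
	 * on that path *cached comes back holding an extra reference,
	 * taken under tree->lock, so it stays valid after the unlock */
	if (btrfs_dec_test_ordered_pending(inode, &ordered, start, len)) {
		/* ... finish checksums, i_size updates, and so on ... */
		btrfs_put_ordered_extent(ordered);	/* drop the cached ref */
	}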
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 1fe1282ef47c..c82f76a9f040 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -21,7 +21,7 @@
21 21
22/* one of these per inode */ 22/* one of these per inode */
23struct btrfs_ordered_inode_tree { 23struct btrfs_ordered_inode_tree {
24 struct mutex mutex; 24 spinlock_t lock;
25 struct rb_root tree; 25 struct rb_root tree;
26 struct rb_node *last; 26 struct rb_node *last;
27}; 27};
@@ -128,8 +128,8 @@ static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
128static inline void 128static inline void
129btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) 129btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
130{ 130{
131 mutex_init(&t->mutex); 131 spin_lock_init(&t->lock);
132 t->tree.rb_node = NULL; 132 t->tree = RB_ROOT;
133 t->last = NULL; 133 t->last = NULL;
134} 134}
135 135
@@ -137,7 +137,8 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
137int btrfs_remove_ordered_extent(struct inode *inode, 137int btrfs_remove_ordered_extent(struct inode *inode,
138 struct btrfs_ordered_extent *entry); 138 struct btrfs_ordered_extent *entry);
139int btrfs_dec_test_ordered_pending(struct inode *inode, 139int btrfs_dec_test_ordered_pending(struct inode *inode,
140 u64 file_offset, u64 io_size); 140 struct btrfs_ordered_extent **cached,
141 u64 file_offset, u64 io_size);
141int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 142int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
142 u64 start, u64 len, u64 disk_len, int type); 143 u64 start, u64 len, u64 disk_len, int type);
143int btrfs_add_ordered_sum(struct inode *inode, 144int btrfs_add_ordered_sum(struct inode *inode,
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index d0cc62bccb94..a97314cf6bd6 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include <linux/sort.h> 21#include <linux/sort.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "ref-cache.h" 23#include "ref-cache.h"
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index bc283ad2db73..e2a55cb2072b 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -52,7 +52,7 @@ static inline size_t btrfs_leaf_ref_size(int nr_extents)
52 52
53static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree) 53static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
54{ 54{
55 tree->root.rb_node = NULL; 55 tree->root = RB_ROOT;
56 INIT_LIST_HEAD(&tree->list); 56 INIT_LIST_HEAD(&tree->list);
57 spin_lock_init(&tree->lock); 57 spin_lock_init(&tree->lock);
58} 58}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index ab7ab5318745..e558dd941ded 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -21,6 +21,7 @@
21#include <linux/writeback.h> 21#include <linux/writeback.h>
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/rbtree.h> 23#include <linux/rbtree.h>
24#include <linux/slab.h>
24#include "ctree.h" 25#include "ctree.h"
25#include "disk-io.h" 26#include "disk-io.h"
26#include "transaction.h" 27#include "transaction.h"
@@ -170,14 +171,14 @@ struct async_merge {
170 171
171static void mapping_tree_init(struct mapping_tree *tree) 172static void mapping_tree_init(struct mapping_tree *tree)
172{ 173{
173 tree->rb_root.rb_node = NULL; 174 tree->rb_root = RB_ROOT;
174 spin_lock_init(&tree->lock); 175 spin_lock_init(&tree->lock);
175} 176}
176 177
177static void backref_cache_init(struct backref_cache *cache) 178static void backref_cache_init(struct backref_cache *cache)
178{ 179{
179 int i; 180 int i;
180 cache->rb_root.rb_node = NULL; 181 cache->rb_root = RB_ROOT;
181 for (i = 0; i < BTRFS_MAX_LEVEL; i++) 182 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
182 INIT_LIST_HEAD(&cache->pending[i]); 183 INIT_LIST_HEAD(&cache->pending[i]);
183 spin_lock_init(&cache->lock); 184 spin_lock_init(&cache->lock);
@@ -2659,7 +2660,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2659 EXTENT_BOUNDARY, GFP_NOFS); 2660 EXTENT_BOUNDARY, GFP_NOFS);
2660 nr++; 2661 nr++;
2661 } 2662 }
2662 btrfs_set_extent_delalloc(inode, page_start, page_end); 2663 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
2663 2664
2664 set_page_dirty(page); 2665 set_page_dirty(page);
2665 dirty_page++; 2666 dirty_page++;
@@ -3487,7 +3488,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3487 key.objectid = objectid; 3488 key.objectid = objectid;
3488 key.type = BTRFS_INODE_ITEM_KEY; 3489 key.type = BTRFS_INODE_ITEM_KEY;
3489 key.offset = 0; 3490 key.offset = 0;
3490 inode = btrfs_iget(root->fs_info->sb, &key, root); 3491 inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
3491 BUG_ON(IS_ERR(inode) || is_bad_inode(inode)); 3492 BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
3492 BTRFS_I(inode)->index_cnt = group->key.objectid; 3493 BTRFS_I(inode)->index_cnt = group->key.objectid;
3493 3494
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8a1ea6e64575..1866dff0538e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -38,6 +38,7 @@
38#include <linux/namei.h> 38#include <linux/namei.h>
39#include <linux/miscdevice.h> 39#include <linux/miscdevice.h>
40#include <linux/magic.h> 40#include <linux/magic.h>
41#include <linux/slab.h>
41#include "compat.h" 42#include "compat.h"
42#include "ctree.h" 43#include "ctree.h"
43#include "disk-io.h" 44#include "disk-io.h"
@@ -63,22 +64,21 @@ static void btrfs_put_super(struct super_block *sb)
63} 64}
64 65
65enum { 66enum {
66 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, 67 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
67 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, 68 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
68 Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, 69 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
69 Opt_compress, Opt_compress_force, Opt_notreelog, Opt_ratio, 70 Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
70 Opt_flushoncommit,
71 Opt_discard, Opt_err, 71 Opt_discard, Opt_err,
72}; 72};
73 73
74static match_table_t tokens = { 74static match_table_t tokens = {
75 {Opt_degraded, "degraded"}, 75 {Opt_degraded, "degraded"},
76 {Opt_subvol, "subvol=%s"}, 76 {Opt_subvol, "subvol=%s"},
77 {Opt_subvolid, "subvolid=%d"},
77 {Opt_device, "device=%s"}, 78 {Opt_device, "device=%s"},
78 {Opt_nodatasum, "nodatasum"}, 79 {Opt_nodatasum, "nodatasum"},
79 {Opt_nodatacow, "nodatacow"}, 80 {Opt_nodatacow, "nodatacow"},
80 {Opt_nobarrier, "nobarrier"}, 81 {Opt_nobarrier, "nobarrier"},
81 {Opt_max_extent, "max_extent=%s"},
82 {Opt_max_inline, "max_inline=%s"}, 82 {Opt_max_inline, "max_inline=%s"},
83 {Opt_alloc_start, "alloc_start=%s"}, 83 {Opt_alloc_start, "alloc_start=%s"},
84 {Opt_thread_pool, "thread_pool=%d"}, 84 {Opt_thread_pool, "thread_pool=%d"},
@@ -95,31 +95,6 @@ static match_table_t tokens = {
95 {Opt_err, NULL}, 95 {Opt_err, NULL},
96}; 96};
97 97
98u64 btrfs_parse_size(char *str)
99{
100 u64 res;
101 int mult = 1;
102 char *end;
103 char last;
104
105 res = simple_strtoul(str, &end, 10);
106
107 last = end[0];
108 if (isalpha(last)) {
109 last = tolower(last);
110 switch (last) {
111 case 'g':
112 mult *= 1024;
113 case 'm':
114 mult *= 1024;
115 case 'k':
116 mult *= 1024;
117 }
118 res = res * mult;
119 }
120 return res;
121}
122
123/* 98/*
124 * Regular mount options parser. Everything that is needed only when 99 * Regular mount options parser. Everything that is needed only when
125 * reading in a new superblock is parsed here. 100 * reading in a new superblock is parsed here.
@@ -128,7 +103,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
128{ 103{
129 struct btrfs_fs_info *info = root->fs_info; 104 struct btrfs_fs_info *info = root->fs_info;
130 substring_t args[MAX_OPT_ARGS]; 105 substring_t args[MAX_OPT_ARGS];
131 char *p, *num; 106 char *p, *num, *orig;
132 int intarg; 107 int intarg;
133 int ret = 0; 108 int ret = 0;
134 109
@@ -143,6 +118,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
143 if (!options) 118 if (!options)
144 return -ENOMEM; 119 return -ENOMEM;
145 120
121 orig = options;
146 122
147 while ((p = strsep(&options, ",")) != NULL) { 123 while ((p = strsep(&options, ",")) != NULL) {
148 int token; 124 int token;
@@ -156,6 +132,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
156 btrfs_set_opt(info->mount_opt, DEGRADED); 132 btrfs_set_opt(info->mount_opt, DEGRADED);
157 break; 133 break;
158 case Opt_subvol: 134 case Opt_subvol:
135 case Opt_subvolid:
159 case Opt_device: 136 case Opt_device:
160 /* 137 /*
161 * These are parsed by btrfs_parse_early_options 138 * These are parsed by btrfs_parse_early_options
@@ -210,22 +187,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
210 info->thread_pool_size); 187 info->thread_pool_size);
211 } 188 }
212 break; 189 break;
213 case Opt_max_extent:
214 num = match_strdup(&args[0]);
215 if (num) {
216 info->max_extent = btrfs_parse_size(num);
217 kfree(num);
218
219 info->max_extent = max_t(u64,
220 info->max_extent, root->sectorsize);
221 printk(KERN_INFO "btrfs: max_extent at %llu\n",
222 (unsigned long long)info->max_extent);
223 }
224 break;
225 case Opt_max_inline: 190 case Opt_max_inline:
226 num = match_strdup(&args[0]); 191 num = match_strdup(&args[0]);
227 if (num) { 192 if (num) {
228 info->max_inline = btrfs_parse_size(num); 193 info->max_inline = memparse(num, NULL);
229 kfree(num); 194 kfree(num);
230 195
231 if (info->max_inline) { 196 if (info->max_inline) {
@@ -240,7 +205,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
240 case Opt_alloc_start: 205 case Opt_alloc_start:
241 num = match_strdup(&args[0]); 206 num = match_strdup(&args[0]);
242 if (num) { 207 if (num) {
243 info->alloc_start = btrfs_parse_size(num); 208 info->alloc_start = memparse(num, NULL);
244 kfree(num); 209 kfree(num);
245 printk(KERN_INFO 210 printk(KERN_INFO
246 "btrfs: allocations start at %llu\n", 211 "btrfs: allocations start at %llu\n",
@@ -280,7 +245,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
280 } 245 }
281 } 246 }
282out: 247out:
283 kfree(options); 248 kfree(orig);
284 return ret; 249 return ret;
285} 250}
286 251
@@ -291,12 +256,13 @@ out:
291 * only when we need to allocate a new super block. 256 * only when we need to allocate a new super block.
292 */ 257 */
293static int btrfs_parse_early_options(const char *options, fmode_t flags, 258static int btrfs_parse_early_options(const char *options, fmode_t flags,
294 void *holder, char **subvol_name, 259 void *holder, char **subvol_name, u64 *subvol_objectid,
295 struct btrfs_fs_devices **fs_devices) 260 struct btrfs_fs_devices **fs_devices)
296{ 261{
297 substring_t args[MAX_OPT_ARGS]; 262 substring_t args[MAX_OPT_ARGS];
298 char *opts, *p; 263 char *opts, *p;
299 int error = 0; 264 int error = 0;
265 int intarg;
300 266
301 if (!options) 267 if (!options)
302 goto out; 268 goto out;
@@ -319,6 +285,18 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
319 case Opt_subvol: 285 case Opt_subvol:
320 *subvol_name = match_strdup(&args[0]); 286 *subvol_name = match_strdup(&args[0]);
321 break; 287 break;
288 case Opt_subvolid:
289 intarg = 0;
290 error = match_int(&args[0], &intarg);
291 if (!error) {
292 /* we want the original fs_tree */
293 if (!intarg)
294 *subvol_objectid =
295 BTRFS_FS_TREE_OBJECTID;
296 else
297 *subvol_objectid = intarg;
298 }
299 break;
322 case Opt_device: 300 case Opt_device:
323 error = btrfs_scan_one_device(match_strdup(&args[0]), 301 error = btrfs_scan_one_device(match_strdup(&args[0]),
324 flags, holder, fs_devices); 302 flags, holder, fs_devices);
@@ -346,6 +324,110 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
346 return error; 324 return error;
347} 325}
348 326
327static struct dentry *get_default_root(struct super_block *sb,
328 u64 subvol_objectid)
329{
330 struct btrfs_root *root = sb->s_fs_info;
331 struct btrfs_root *new_root;
332 struct btrfs_dir_item *di;
333 struct btrfs_path *path;
334 struct btrfs_key location;
335 struct inode *inode;
336 struct dentry *dentry;
337 u64 dir_id;
338 int new = 0;
339
340 /*
341 * We have a specific subvol we want to mount, just setup location and
342 * go look up the root.
343 */
344 if (subvol_objectid) {
345 location.objectid = subvol_objectid;
346 location.type = BTRFS_ROOT_ITEM_KEY;
347 location.offset = (u64)-1;
348 goto find_root;
349 }
350
351 path = btrfs_alloc_path();
352 if (!path)
353 return ERR_PTR(-ENOMEM);
354 path->leave_spinning = 1;
355
356 /*
357 * Find the "default" dir item which points to the root item that we
358 * will mount by default if we haven't been given a specific subvolume
359 * to mount.
360 */
361 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
362 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
363 if (!di) {
364 /*
365 * Ok the default dir item isn't there. This is weird since
366 * it's always been there, but don't freak out, just try and
367 * mount to root most subvolume.
368 */
369 btrfs_free_path(path);
370 dir_id = BTRFS_FIRST_FREE_OBJECTID;
371 new_root = root->fs_info->fs_root;
372 goto setup_root;
373 }
374
375 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
376 btrfs_free_path(path);
377
378find_root:
379 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
380 if (IS_ERR(new_root))
381 return ERR_PTR(PTR_ERR(new_root));
382
383 if (btrfs_root_refs(&new_root->root_item) == 0)
384 return ERR_PTR(-ENOENT);
385
386 dir_id = btrfs_root_dirid(&new_root->root_item);
387setup_root:
388 location.objectid = dir_id;
389 location.type = BTRFS_INODE_ITEM_KEY;
390 location.offset = 0;
391
392 inode = btrfs_iget(sb, &location, new_root, &new);
393 if (!inode)
394 return ERR_PTR(-ENOMEM);
395
396 /*
397 * If we're just mounting the root most subvol put the inode and return
398 * a reference to the dentry. We will have already gotten a reference
399 * to the inode in btrfs_fill_super so we're good to go.
400 */
401 if (!new && sb->s_root->d_inode == inode) {
402 iput(inode);
403 return dget(sb->s_root);
404 }
405
406 if (new) {
407 const struct qstr name = { .name = "/", .len = 1 };
408
409 /*
410 * New inode, we need to make the dentry a sibling of s_root so
411 * everything gets cleaned up properly on unmount.
412 */
413 dentry = d_alloc(sb->s_root, &name);
414 if (!dentry) {
415 iput(inode);
416 return ERR_PTR(-ENOMEM);
417 }
418 d_splice_alias(inode, dentry);
419 } else {
420 /*
421 * We found the inode in cache, just find a dentry for it and
422 * put the reference to the inode we just got.
423 */
424 dentry = d_find_alias(inode);
425 iput(inode);
426 }
427
428 return dentry;
429}
430
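Taken together with the option parsing above, a mount such as "mount -t btrfs -o subvolid=256 /dev/sdb /mnt" (device and id purely illustrative) reaches get_default_root() with subvol_objectid set and jumps straight to find_root, while a plain mount follows the "default" dir item, which is exactly the item BTRFS_IOC_DEFAULT_SUBVOL repoints.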
349static int btrfs_fill_super(struct super_block *sb, 431static int btrfs_fill_super(struct super_block *sb,
350 struct btrfs_fs_devices *fs_devices, 432 struct btrfs_fs_devices *fs_devices,
351 void *data, int silent) 433 void *data, int silent)
@@ -379,7 +461,7 @@ static int btrfs_fill_super(struct super_block *sb,
379 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 461 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
380 key.type = BTRFS_INODE_ITEM_KEY; 462 key.type = BTRFS_INODE_ITEM_KEY;
381 key.offset = 0; 463 key.offset = 0;
382 inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root); 464 inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL);
383 if (IS_ERR(inode)) { 465 if (IS_ERR(inode)) {
384 err = PTR_ERR(inode); 466 err = PTR_ERR(inode);
385 goto fail_close; 467 goto fail_close;
@@ -391,12 +473,6 @@ static int btrfs_fill_super(struct super_block *sb,
391 err = -ENOMEM; 473 err = -ENOMEM;
392 goto fail_close; 474 goto fail_close;
393 } 475 }
394#if 0
395 /* this does the super kobj at the same time */
396 err = btrfs_sysfs_add_super(tree_root->fs_info);
397 if (err)
398 goto fail_close;
399#endif
400 476
401 sb->s_root = root_dentry; 477 sb->s_root = root_dentry;
402 478
@@ -440,9 +516,6 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
440 seq_puts(seq, ",nodatacow"); 516 seq_puts(seq, ",nodatacow");
441 if (btrfs_test_opt(root, NOBARRIER)) 517 if (btrfs_test_opt(root, NOBARRIER))
442 seq_puts(seq, ",nobarrier"); 518 seq_puts(seq, ",nobarrier");
443 if (info->max_extent != (u64)-1)
444 seq_printf(seq, ",max_extent=%llu",
445 (unsigned long long)info->max_extent);
446 if (info->max_inline != 8192 * 1024) 519 if (info->max_inline != 8192 * 1024)
447 seq_printf(seq, ",max_inline=%llu", 520 seq_printf(seq, ",max_inline=%llu",
448 (unsigned long long)info->max_inline); 521 (unsigned long long)info->max_inline);
@@ -488,19 +561,22 @@ static int btrfs_test_super(struct super_block *s, void *data)
488static int btrfs_get_sb(struct file_system_type *fs_type, int flags, 561static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
489 const char *dev_name, void *data, struct vfsmount *mnt) 562 const char *dev_name, void *data, struct vfsmount *mnt)
490{ 563{
491 char *subvol_name = NULL;
492 struct block_device *bdev = NULL; 564 struct block_device *bdev = NULL;
493 struct super_block *s; 565 struct super_block *s;
494 struct dentry *root; 566 struct dentry *root;
495 struct btrfs_fs_devices *fs_devices = NULL; 567 struct btrfs_fs_devices *fs_devices = NULL;
496 fmode_t mode = FMODE_READ; 568 fmode_t mode = FMODE_READ;
569 char *subvol_name = NULL;
570 u64 subvol_objectid = 0;
497 int error = 0; 571 int error = 0;
572 int found = 0;
498 573
499 if (!(flags & MS_RDONLY)) 574 if (!(flags & MS_RDONLY))
500 mode |= FMODE_WRITE; 575 mode |= FMODE_WRITE;
501 576
502 error = btrfs_parse_early_options(data, mode, fs_type, 577 error = btrfs_parse_early_options(data, mode, fs_type,
503 &subvol_name, &fs_devices); 578 &subvol_name, &subvol_objectid,
579 &fs_devices);
504 if (error) 580 if (error)
505 return error; 581 return error;
506 582
@@ -529,6 +605,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
529 goto error_close_devices; 605 goto error_close_devices;
530 } 606 }
531 607
608 found = 1;
532 btrfs_close_devices(fs_devices); 609 btrfs_close_devices(fs_devices);
533 } else { 610 } else {
534 char b[BDEVNAME_SIZE]; 611 char b[BDEVNAME_SIZE];
@@ -546,25 +623,35 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
546 s->s_flags |= MS_ACTIVE; 623 s->s_flags |= MS_ACTIVE;
547 } 624 }
548 625
549 if (!strcmp(subvol_name, ".")) 626 root = get_default_root(s, subvol_objectid);
550 root = dget(s->s_root); 627 if (IS_ERR(root)) {
551 else { 628 error = PTR_ERR(root);
552 mutex_lock(&s->s_root->d_inode->i_mutex); 629 deactivate_locked_super(s);
553 root = lookup_one_len(subvol_name, s->s_root, 630 goto error;
631 }
632 /* if they gave us a subvolume name bind mount into that */
633 if (strcmp(subvol_name, ".")) {
634 struct dentry *new_root;
635 mutex_lock(&root->d_inode->i_mutex);
636 new_root = lookup_one_len(subvol_name, root,
554 strlen(subvol_name)); 637 strlen(subvol_name));
555 mutex_unlock(&s->s_root->d_inode->i_mutex); 638 mutex_unlock(&root->d_inode->i_mutex);
556 639
557 if (IS_ERR(root)) { 640 if (IS_ERR(new_root)) {
558 deactivate_locked_super(s); 641 deactivate_locked_super(s);
559 error = PTR_ERR(root); 642 error = PTR_ERR(new_root);
560 goto error_free_subvol_name; 643 dput(root);
644 goto error_close_devices;
561 } 645 }
562 if (!root->d_inode) { 646 if (!new_root->d_inode) {
563 dput(root); 647 dput(root);
648 dput(new_root);
564 deactivate_locked_super(s); 649 deactivate_locked_super(s);
565 error = -ENXIO; 650 error = -ENXIO;
566 goto error_free_subvol_name; 651 goto error_close_devices;
567 } 652 }
653 dput(root);
654 root = new_root;
568 } 655 }
569 656
570 mnt->mnt_sb = s; 657 mnt->mnt_sb = s;
@@ -579,6 +666,7 @@ error_close_devices:
579 btrfs_close_devices(fs_devices); 666 btrfs_close_devices(fs_devices);
580error_free_subvol_name: 667error_free_subvol_name:
581 kfree(subvol_name); 668 kfree(subvol_name);
669error:
582 return error; 670 return error;
583} 671}
584 672
@@ -623,14 +711,37 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
623{ 711{
624 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 712 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
625 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 713 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
714 struct list_head *head = &root->fs_info->space_info;
715 struct btrfs_space_info *found;
716 u64 total_used = 0;
717 u64 data_used = 0;
626 int bits = dentry->d_sb->s_blocksize_bits; 718 int bits = dentry->d_sb->s_blocksize_bits;
627 __be32 *fsid = (__be32 *)root->fs_info->fsid; 719 __be32 *fsid = (__be32 *)root->fs_info->fsid;
628 720
721 rcu_read_lock();
722 list_for_each_entry_rcu(found, head, list) {
723 if (found->flags & (BTRFS_BLOCK_GROUP_DUP|
724 BTRFS_BLOCK_GROUP_RAID10|
725 BTRFS_BLOCK_GROUP_RAID1)) {
726 total_used += found->bytes_used;
727 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
728 data_used += found->bytes_used;
729 else
730 data_used += found->total_bytes;
731 }
732
733 total_used += found->bytes_used;
734 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
735 data_used += found->bytes_used;
736 else
737 data_used += found->total_bytes;
738 }
739 rcu_read_unlock();
740
629 buf->f_namelen = BTRFS_NAME_LEN; 741 buf->f_namelen = BTRFS_NAME_LEN;
630 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 742 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
631 buf->f_bfree = buf->f_blocks - 743 buf->f_bfree = buf->f_blocks - (total_used >> bits);
632 (btrfs_super_bytes_used(disk_super) >> bits); 744 buf->f_bavail = buf->f_blocks - (data_used >> bits);
633 buf->f_bavail = buf->f_bfree;
634 buf->f_bsize = dentry->d_sb->s_blocksize; 745 buf->f_bsize = dentry->d_sb->s_blocksize;
635 buf->f_type = BTRFS_SUPER_MAGIC; 746 buf->f_type = BTRFS_SUPER_MAGIC;
636 747
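A rough worked example of the new accounting, assuming a two-device RAID1 filesystem with 2 TiB of raw space holding 100 GiB of data: the RAID1 branch adds bytes_used once and the unconditional tail adds it again, so total_used comes to about 200 GiB of raw consumption, and f_bfree reports the remaining raw blocks (roughly 1.8 TiB worth) rather than the old replication-blind btrfs_super_bytes_used() figure.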
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index a240b6fa81df..4ce16ef702a3 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -164,12 +164,12 @@ static void btrfs_root_release(struct kobject *kobj)
164 complete(&root->kobj_unregister); 164 complete(&root->kobj_unregister);
165} 165}
166 166
167static struct sysfs_ops btrfs_super_attr_ops = { 167static const struct sysfs_ops btrfs_super_attr_ops = {
168 .show = btrfs_super_attr_show, 168 .show = btrfs_super_attr_show,
169 .store = btrfs_super_attr_store, 169 .store = btrfs_super_attr_store,
170}; 170};
171 171
172static struct sysfs_ops btrfs_root_attr_ops = { 172static const struct sysfs_ops btrfs_root_attr_ops = {
173 .show = btrfs_root_attr_show, 173 .show = btrfs_root_attr_show,
174 .store = btrfs_root_attr_store, 174 .store = btrfs_root_attr_store,
175}; 175};
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b2acc79f1b34..2cb116099b90 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/slab.h>
20#include <linux/sched.h> 21#include <linux/sched.h>
21#include <linux/writeback.h> 22#include <linux/writeback.h>
22#include <linux/pagemap.h> 23#include <linux/pagemap.h>
@@ -69,7 +70,7 @@ static noinline int join_transaction(struct btrfs_root *root)
69 cur_trans->commit_done = 0; 70 cur_trans->commit_done = 0;
70 cur_trans->start_time = get_seconds(); 71 cur_trans->start_time = get_seconds();
71 72
72 cur_trans->delayed_refs.root.rb_node = NULL; 73 cur_trans->delayed_refs.root = RB_ROOT;
73 cur_trans->delayed_refs.num_entries = 0; 74 cur_trans->delayed_refs.num_entries = 0;
74 cur_trans->delayed_refs.num_heads_ready = 0; 75 cur_trans->delayed_refs.num_heads_ready = 0;
75 cur_trans->delayed_refs.num_heads = 0; 76 cur_trans->delayed_refs.num_heads = 0;
@@ -147,18 +148,13 @@ static void wait_current_trans(struct btrfs_root *root)
147 while (1) { 148 while (1) {
148 prepare_to_wait(&root->fs_info->transaction_wait, &wait, 149 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
149 TASK_UNINTERRUPTIBLE); 150 TASK_UNINTERRUPTIBLE);
150 if (cur_trans->blocked) { 151 if (!cur_trans->blocked)
151 mutex_unlock(&root->fs_info->trans_mutex);
152 schedule();
153 mutex_lock(&root->fs_info->trans_mutex);
154 finish_wait(&root->fs_info->transaction_wait,
155 &wait);
156 } else {
157 finish_wait(&root->fs_info->transaction_wait,
158 &wait);
159 break; 152 break;
160 } 153 mutex_unlock(&root->fs_info->trans_mutex);
154 schedule();
155 mutex_lock(&root->fs_info->trans_mutex);
161 } 156 }
157 finish_wait(&root->fs_info->transaction_wait, &wait);
162 put_transaction(cur_trans); 158 put_transaction(cur_trans);
163 } 159 }
164} 160}
@@ -760,10 +756,17 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
760 struct btrfs_root_item *new_root_item; 756 struct btrfs_root_item *new_root_item;
761 struct btrfs_root *tree_root = fs_info->tree_root; 757 struct btrfs_root *tree_root = fs_info->tree_root;
762 struct btrfs_root *root = pending->root; 758 struct btrfs_root *root = pending->root;
759 struct btrfs_root *parent_root;
760 struct inode *parent_inode;
763 struct extent_buffer *tmp; 761 struct extent_buffer *tmp;
764 struct extent_buffer *old; 762 struct extent_buffer *old;
765 int ret; 763 int ret;
766 u64 objectid; 764 u64 objectid;
765 int namelen;
766 u64 index = 0;
767
768 parent_inode = pending->dentry->d_parent->d_inode;
769 parent_root = BTRFS_I(parent_inode)->root;
767 770
768 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 771 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
769 if (!new_root_item) { 772 if (!new_root_item) {
@@ -774,79 +777,59 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
774 if (ret) 777 if (ret)
775 goto fail; 778 goto fail;
776 779
777 record_root_in_trans(trans, root);
778 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
779 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
780
781 key.objectid = objectid; 780 key.objectid = objectid;
782 /* record when the snapshot was created in key.offset */ 781 /* record when the snapshot was created in key.offset */
783 key.offset = trans->transid; 782 key.offset = trans->transid;
784 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 783 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
785 784
786 old = btrfs_lock_root_node(root);
787 btrfs_cow_block(trans, root, old, NULL, 0, &old);
788 btrfs_set_lock_blocking(old);
789
790 btrfs_copy_root(trans, root, old, &tmp, objectid);
791 btrfs_tree_unlock(old);
792 free_extent_buffer(old);
793
794 btrfs_set_root_node(new_root_item, tmp);
795 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
796 new_root_item);
797 btrfs_tree_unlock(tmp);
798 free_extent_buffer(tmp);
799 if (ret)
800 goto fail;
801
802 key.offset = (u64)-1;
803 memcpy(&pending->root_key, &key, sizeof(key)); 785 memcpy(&pending->root_key, &key, sizeof(key));
804fail: 786 pending->root_key.offset = (u64)-1;
805 kfree(new_root_item);
806 return ret;
807}
808
809static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
810 struct btrfs_pending_snapshot *pending)
811{
812 int ret;
813 int namelen;
814 u64 index = 0;
815 struct btrfs_trans_handle *trans;
816 struct inode *parent_inode;
817 struct btrfs_root *parent_root;
818
819 parent_inode = pending->dentry->d_parent->d_inode;
820 parent_root = BTRFS_I(parent_inode)->root;
821 trans = btrfs_join_transaction(parent_root, 1);
822 787
788 record_root_in_trans(trans, parent_root);
823 /* 789 /*
824 * insert the directory item 790 * insert the directory item
825 */ 791 */
826 namelen = strlen(pending->name); 792 namelen = strlen(pending->name);
827 ret = btrfs_set_inode_index(parent_inode, &index); 793 ret = btrfs_set_inode_index(parent_inode, &index);
794 BUG_ON(ret);
828 ret = btrfs_insert_dir_item(trans, parent_root, 795 ret = btrfs_insert_dir_item(trans, parent_root,
829 pending->name, namelen, 796 pending->name, namelen,
830 parent_inode->i_ino, 797 parent_inode->i_ino,
831 &pending->root_key, BTRFS_FT_DIR, index); 798 &pending->root_key, BTRFS_FT_DIR, index);
832 799 BUG_ON(ret);
833 if (ret)
834 goto fail;
835 800
836 btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2); 801 btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
837 ret = btrfs_update_inode(trans, parent_root, parent_inode); 802 ret = btrfs_update_inode(trans, parent_root, parent_inode);
838 BUG_ON(ret); 803 BUG_ON(ret);
839 804
805 record_root_in_trans(trans, root);
806 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
807 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
808
809 old = btrfs_lock_root_node(root);
810 btrfs_cow_block(trans, root, old, NULL, 0, &old);
811 btrfs_set_lock_blocking(old);
812
813 btrfs_copy_root(trans, root, old, &tmp, objectid);
814 btrfs_tree_unlock(old);
815 free_extent_buffer(old);
816
817 btrfs_set_root_node(new_root_item, tmp);
818 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
819 new_root_item);
820 BUG_ON(ret);
821 btrfs_tree_unlock(tmp);
822 free_extent_buffer(tmp);
823
840 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, 824 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
841 pending->root_key.objectid, 825 pending->root_key.objectid,
842 parent_root->root_key.objectid, 826 parent_root->root_key.objectid,
843 parent_inode->i_ino, index, pending->name, 827 parent_inode->i_ino, index, pending->name,
844 namelen); 828 namelen);
845
846 BUG_ON(ret); 829 BUG_ON(ret);
847 830
848fail: 831fail:
849 btrfs_end_transaction(trans, fs_info->fs_root); 832 kfree(new_root_item);
850 return ret; 833 return ret;
851} 834}
852 835
@@ -867,25 +850,6 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
867 return 0; 850 return 0;
868} 851}
869 852
870static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
871 struct btrfs_fs_info *fs_info)
872{
873 struct btrfs_pending_snapshot *pending;
874 struct list_head *head = &trans->transaction->pending_snapshots;
875 int ret;
876
877 while (!list_empty(head)) {
878 pending = list_entry(head->next,
879 struct btrfs_pending_snapshot, list);
880 ret = finish_pending_snapshot(fs_info, pending);
881 BUG_ON(ret);
882 list_del(&pending->list);
883 kfree(pending->name);
884 kfree(pending);
885 }
886 return 0;
887}
888
889static void update_super_roots(struct btrfs_root *root) 853static void update_super_roots(struct btrfs_root *root)
890{ 854{
891 struct btrfs_root_item *root_item; 855 struct btrfs_root_item *root_item;
@@ -997,13 +961,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
997 961
998 mutex_unlock(&root->fs_info->trans_mutex); 962 mutex_unlock(&root->fs_info->trans_mutex);
999 963
1000 if (flush_on_commit) { 964 if (flush_on_commit || snap_pending) {
1001 btrfs_start_delalloc_inodes(root, 1); 965 btrfs_start_delalloc_inodes(root, 1);
1002 ret = btrfs_wait_ordered_extents(root, 0, 1); 966 ret = btrfs_wait_ordered_extents(root, 0, 1);
1003 BUG_ON(ret); 967 BUG_ON(ret);
1004 } else if (snap_pending) {
1005 ret = btrfs_wait_ordered_extents(root, 0, 1);
1006 BUG_ON(ret);
1007 } 968 }
1008 969
1009 /* 970 /*
@@ -1100,9 +1061,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1100 1061
1101 btrfs_finish_extent_commit(trans, root); 1062 btrfs_finish_extent_commit(trans, root);
1102 1063
1103 /* do the directory inserts of any pending snapshot creations */
1104 finish_pending_snapshots(trans, root->fs_info);
1105
1106 mutex_lock(&root->fs_info->trans_mutex); 1064 mutex_lock(&root->fs_info->trans_mutex);
1107 1065
1108 cur_trans->commit_done = 1; 1066 cur_trans->commit_done = 1;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4a9434b622ec..af57dd2b43d4 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "transaction.h" 22#include "transaction.h"
22#include "disk-io.h" 23#include "disk-io.h"
@@ -445,7 +446,7 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root,
445 key.objectid = objectid; 446 key.objectid = objectid;
446 key.type = BTRFS_INODE_ITEM_KEY; 447 key.type = BTRFS_INODE_ITEM_KEY;
447 key.offset = 0; 448 key.offset = 0;
448 inode = btrfs_iget(root->fs_info->sb, &key, root); 449 inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
449 if (IS_ERR(inode)) { 450 if (IS_ERR(inode)) {
450 inode = NULL; 451 inode = NULL;
451 } else if (is_bad_inode(inode)) { 452 } else if (is_bad_inode(inode)) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 41ecbb2347f2..8db7b14bbae8 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/bio.h> 19#include <linux/bio.h>
20#include <linux/slab.h>
20#include <linux/buffer_head.h> 21#include <linux/buffer_head.h>
21#include <linux/blkdev.h> 22#include <linux/blkdev.h>
22#include <linux/random.h> 23#include <linux/random.h>
@@ -256,13 +257,13 @@ loop_lock:
256 wake_up(&fs_info->async_submit_wait); 257 wake_up(&fs_info->async_submit_wait);
257 258
258 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 259 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
259 submit_bio(cur->bi_rw, cur);
260 num_run++;
261 batch_run++;
262 260
263 if (bio_rw_flagged(cur, BIO_RW_SYNCIO)) 261 if (bio_rw_flagged(cur, BIO_RW_SYNCIO))
264 num_sync_run++; 262 num_sync_run++;
265 263
264 submit_bio(cur->bi_rw, cur);
265 num_run++;
266 batch_run++;
266 if (need_resched()) { 267 if (need_resched()) {
267 if (num_sync_run) { 268 if (num_sync_run) {
268 blk_run_backing_dev(bdi, NULL); 269 blk_run_backing_dev(bdi, NULL);
@@ -325,16 +326,6 @@ loop_lock:
325 num_sync_run = 0; 326 num_sync_run = 0;
326 blk_run_backing_dev(bdi, NULL); 327 blk_run_backing_dev(bdi, NULL);
327 } 328 }
328
329 cond_resched();
330 if (again)
331 goto loop;
332
333 spin_lock(&device->io_lock);
334 if (device->pending_bios.head || device->pending_sync_bios.head)
335 goto loop_lock;
336 spin_unlock(&device->io_lock);
337
338 /* 329 /*
339 * IO has already been through a long path to get here. Checksumming, 330 * IO has already been through a long path to get here. Checksumming,
340 * async helper threads, perhaps compression. We've done a pretty 331 * async helper threads, perhaps compression. We've done a pretty
@@ -346,6 +337,16 @@ loop_lock:
346 * cared about found its way down here. 337 * cared about found its way down here.
347 */ 338 */
348 blk_run_backing_dev(bdi, NULL); 339 blk_run_backing_dev(bdi, NULL);
340
341 cond_resched();
342 if (again)
343 goto loop;
344
345 spin_lock(&device->io_lock);
346 if (device->pending_bios.head || device->pending_sync_bios.head)
347 goto loop_lock;
348 spin_unlock(&device->io_lock);
349
349done: 350done:
350 return 0; 351 return 0;
351} 352}
@@ -365,6 +366,7 @@ static noinline int device_list_add(const char *path,
365 struct btrfs_device *device; 366 struct btrfs_device *device;
366 struct btrfs_fs_devices *fs_devices; 367 struct btrfs_fs_devices *fs_devices;
367 u64 found_transid = btrfs_super_generation(disk_super); 368 u64 found_transid = btrfs_super_generation(disk_super);
369 char *name;
368 370
369 fs_devices = find_fsid(disk_super->fsid); 371 fs_devices = find_fsid(disk_super->fsid);
370 if (!fs_devices) { 372 if (!fs_devices) {
@@ -411,6 +413,12 @@ static noinline int device_list_add(const char *path,
411 413
412 device->fs_devices = fs_devices; 414 device->fs_devices = fs_devices;
413 fs_devices->num_devices++; 415 fs_devices->num_devices++;
416 } else if (strcmp(device->name, path)) {
417 name = kstrdup(path, GFP_NOFS);
418 if (!name)
419 return -ENOMEM;
420 kfree(device->name);
421 device->name = name;
414 } 422 }
415 423
416 if (found_transid > fs_devices->latest_trans) { 424 if (found_transid > fs_devices->latest_trans) {
@@ -592,7 +600,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
592 goto error_close; 600 goto error_close;
593 601
594 disk_super = (struct btrfs_super_block *)bh->b_data; 602 disk_super = (struct btrfs_super_block *)bh->b_data;
595 devid = le64_to_cpu(disk_super->dev_item.devid); 603 devid = btrfs_stack_device_id(&disk_super->dev_item);
596 if (devid != device->devid) 604 if (devid != device->devid)
597 goto error_brelse; 605 goto error_brelse;
598 606
@@ -694,7 +702,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
694 goto error_close; 702 goto error_close;
695 } 703 }
696 disk_super = (struct btrfs_super_block *)bh->b_data; 704 disk_super = (struct btrfs_super_block *)bh->b_data;
697 devid = le64_to_cpu(disk_super->dev_item.devid); 705 devid = btrfs_stack_device_id(&disk_super->dev_item);
698 transid = btrfs_super_generation(disk_super); 706 transid = btrfs_super_generation(disk_super);
699 if (disk_super->label[0]) 707 if (disk_super->label[0])
700 printk(KERN_INFO "device label %s ", disk_super->label); 708 printk(KERN_INFO "device label %s ", disk_super->label);
@@ -1187,7 +1195,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1187 goto error_close; 1195 goto error_close;
1188 } 1196 }
1189 disk_super = (struct btrfs_super_block *)bh->b_data; 1197 disk_super = (struct btrfs_super_block *)bh->b_data;
1190 devid = le64_to_cpu(disk_super->dev_item.devid); 1198 devid = btrfs_stack_device_id(&disk_super->dev_item);
1191 dev_uuid = disk_super->dev_item.uuid; 1199 dev_uuid = disk_super->dev_item.uuid;
1192 device = btrfs_find_device(root, devid, dev_uuid, 1200 device = btrfs_find_device(root, devid, dev_uuid,
1193 disk_super->fsid); 1201 disk_super->fsid);
@@ -2191,9 +2199,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2191 min_stripes = 2; 2199 min_stripes = 2;
2192 } 2200 }
2193 if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 2201 if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
2194 num_stripes = min_t(u64, 2, fs_devices->rw_devices); 2202 if (fs_devices->rw_devices < 2)
2195 if (num_stripes < 2)
2196 return -ENOSPC; 2203 return -ENOSPC;
2204 num_stripes = 2;
2197 min_stripes = 2; 2205 min_stripes = 2;
2198 } 2206 }
2199 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2207 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
@@ -2237,8 +2245,16 @@ again:
2237 do_div(calc_size, stripe_len); 2245 do_div(calc_size, stripe_len);
2238 calc_size *= stripe_len; 2246 calc_size *= stripe_len;
2239 } 2247 }
2248
2240 /* we don't want tiny stripes */ 2249 /* we don't want tiny stripes */
2241 calc_size = max_t(u64, min_stripe_size, calc_size); 2250 if (!looped)
2251 calc_size = max_t(u64, min_stripe_size, calc_size);
2252
2253 /*
2254 * we're about to do_div by the stripe_len so lets make sure
2255 * we end up with something bigger than a stripe
2256 */
2257 calc_size = max_t(u64, calc_size, stripe_len * 4);
2242 2258
2243 do_div(calc_size, stripe_len); 2259 do_div(calc_size, stripe_len);
2244 calc_size *= stripe_len; 2260 calc_size *= stripe_len;
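With btrfs's usual 64 KiB stripe_len (an assumption of this example), the new clamp keeps calc_size at 256 KiB or more before the do_div()/multiply pair rounds it down to a stripe multiple, so the retry pass, where the min_stripe_size floor is deliberately skipped, can no longer round a small allocation down to a zero-length stripe.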
@@ -3382,6 +3398,8 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
3382 key.type = 0; 3398 key.type = 0;
3383again: 3399again:
3384 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3400 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3401 if (ret < 0)
3402 goto error;
3385 while (1) { 3403 while (1) {
3386 leaf = path->nodes[0]; 3404 leaf = path->nodes[0];
3387 slot = path->slots[0]; 3405 slot = path->slots[0];
diff --git a/fs/buffer.c b/fs/buffer.c
index 6fa530256bfd..c9c266db0624 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2893,7 +2893,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2893 2893
2894 /* 2894 /*
2895 * The page straddles i_size. It must be zeroed out on each and every 2895 * The page straddles i_size. It must be zeroed out on each and every
2896 * writepage invokation because it may be mmapped. "A file is mapped 2896 * writepage invocation because it may be mmapped. "A file is mapped
2897 * in multiples of the page size. For a file that is not a multiple of 2897 * in multiples of the page size. For a file that is not a multiple of
2898 * the page size, the remaining memory is zeroed when mapped, and 2898 * the page size, the remaining memory is zeroed when mapped, and
2899 * writes to that region are not written out to the file." 2899 * writes to that region are not written out to the file."
@@ -3265,7 +3265,7 @@ static void recalc_bh_state(void)
3265 3265
3266struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) 3266struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3267{ 3267{
3268 struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags); 3268 struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3269 if (ret) { 3269 if (ret) {
3270 INIT_LIST_HEAD(&ret->b_assoc_buffers); 3270 INIT_LIST_HEAD(&ret->b_assoc_buffers);
3271 get_cpu_var(bh_accounting).nr++; 3271 get_cpu_var(bh_accounting).nr++;
@@ -3352,15 +3352,6 @@ int bh_submit_read(struct buffer_head *bh)
3352} 3352}
3353EXPORT_SYMBOL(bh_submit_read); 3353EXPORT_SYMBOL(bh_submit_read);
3354 3354
3355static void
3356init_buffer_head(void *data)
3357{
3358 struct buffer_head *bh = data;
3359
3360 memset(bh, 0, sizeof(*bh));
3361 INIT_LIST_HEAD(&bh->b_assoc_buffers);
3362}
3363
3364void __init buffer_init(void) 3355void __init buffer_init(void)
3365{ 3356{
3366 int nrpages; 3357 int nrpages;
@@ -3369,7 +3360,7 @@ void __init buffer_init(void)
3369 sizeof(struct buffer_head), 0, 3360 sizeof(struct buffer_head), 0,
3370 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| 3361 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3371 SLAB_MEM_SPREAD), 3362 SLAB_MEM_SPREAD),
3372 init_buffer_head); 3363 NULL);
3373 3364
3374 /* 3365 /*
3375 * Limit the bh occupancy to 10% of ZONE_NORMAL 3366 * Limit the bh occupancy to 10% of ZONE_NORMAL
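[Editorial note] The buffer.c hunk above trades the slab constructor for kmem_cache_zalloc(). A slab constructor runs only when the cache acquires a fresh page, not on every allocation, so objects had to be returned to their constructed state on free; zeroing at allocation time removes that subtlety. A minimal sketch of what the zalloc variant amounts to (illustrative, not the mm implementation):

static inline void *zalloc_sketch(struct kmem_cache *cache, gfp_t gfp)
{
	void *obj = kmem_cache_alloc(cache, gfp);

	if (obj)
		memset(obj, 0, kmem_cache_size(cache)); /* zero every alloc */
	return obj;
}

The per-allocation INIT_LIST_HEAD() stays in alloc_buffer_head(), which is all the removed constructor did beyond its memset.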
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 27089311fbea..37fe101a4e0d 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -9,6 +9,7 @@
9 * 2 of the Licence, or (at your option) any later version. 9 * 2 of the Licence, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/slab.h>
12#include <linux/mount.h> 13#include <linux/mount.h>
13#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
14#include "internal.h" 15#include "internal.h"
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index f7c255f9c624..a8cd821226da 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -34,6 +34,7 @@ struct cachefiles_object {
34 loff_t i_size; /* object size */ 34 loff_t i_size; /* object size */
35 unsigned long flags; 35 unsigned long flags;
36#define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */ 36#define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */
37#define CACHEFILES_OBJECT_BURIED 1 /* T if preemptively buried */
37 atomic_t usage; /* object usage count */ 38 atomic_t usage; /* object usage count */
38 uint8_t type; /* object type */ 39 uint8_t type; /* object type */
39 uint8_t new; /* T if object new */ 40 uint8_t new; /* T if object new */
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index eeb4986ea7db..f4a7840bf42c 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -19,6 +19,7 @@
19#include <linux/mount.h> 19#include <linux/mount.h>
20#include <linux/namei.h> 20#include <linux/namei.h>
21#include <linux/security.h> 21#include <linux/security.h>
22#include <linux/slab.h>
22#include "internal.h" 23#include "internal.h"
23 24
24#define CACHEFILES_KEYBUF_SIZE 512 25#define CACHEFILES_KEYBUF_SIZE 512
@@ -92,6 +93,59 @@ static noinline void cachefiles_printk_object(struct cachefiles_object *object,
92} 93}
93 94
94/* 95/*
96 * mark the owner of a dentry, if there is one, to indicate that that dentry
97 * has been preemptively deleted
98 * - the caller must hold the i_mutex on the dentry's parent as required to
99 * call vfs_unlink(), vfs_rmdir() or vfs_rename()
100 */
101static void cachefiles_mark_object_buried(struct cachefiles_cache *cache,
102 struct dentry *dentry)
103{
104 struct cachefiles_object *object;
105 struct rb_node *p;
106
107 _enter(",'%*.*s'",
108 dentry->d_name.len, dentry->d_name.len, dentry->d_name.name);
109
110 write_lock(&cache->active_lock);
111
112 p = cache->active_nodes.rb_node;
113 while (p) {
114 object = rb_entry(p, struct cachefiles_object, active_node);
115 if (object->dentry > dentry)
116 p = p->rb_left;
117 else if (object->dentry < dentry)
118 p = p->rb_right;
119 else
120 goto found_dentry;
121 }
122
123 write_unlock(&cache->active_lock);
124 _leave(" [no owner]");
125 return;
126
 127 /* found the dentry for this object */
128found_dentry:
129 kdebug("preemptive burial: OBJ%x [%s] %p",
130 object->fscache.debug_id,
131 fscache_object_states[object->fscache.state],
132 dentry);
133
134 if (object->fscache.state < FSCACHE_OBJECT_DYING) {
135 printk(KERN_ERR "\n");
136 printk(KERN_ERR "CacheFiles: Error:"
137 " Can't preemptively bury live object\n");
138 cachefiles_printk_object(object, NULL);
139 } else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) {
140 printk(KERN_ERR "CacheFiles: Error:"
141 " Object already preemptively buried\n");
142 }
143
144 write_unlock(&cache->active_lock);
145 _leave(" [owner marked]");
146}
147
148/*
95 * record the fact that an object is now active 149 * record the fact that an object is now active
96 */ 150 */
97static int cachefiles_mark_object_active(struct cachefiles_cache *cache, 151static int cachefiles_mark_object_active(struct cachefiles_cache *cache,
@@ -218,7 +272,8 @@ requeue:
218 */ 272 */
219static int cachefiles_bury_object(struct cachefiles_cache *cache, 273static int cachefiles_bury_object(struct cachefiles_cache *cache,
220 struct dentry *dir, 274 struct dentry *dir,
221 struct dentry *rep) 275 struct dentry *rep,
276 bool preemptive)
222{ 277{
223 struct dentry *grave, *trap; 278 struct dentry *grave, *trap;
224 char nbuffer[8 + 8 + 1]; 279 char nbuffer[8 + 8 + 1];
@@ -228,11 +283,16 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
228 dir->d_name.len, dir->d_name.len, dir->d_name.name, 283 dir->d_name.len, dir->d_name.len, dir->d_name.name,
229 rep->d_name.len, rep->d_name.len, rep->d_name.name); 284 rep->d_name.len, rep->d_name.len, rep->d_name.name);
230 285
286 _debug("remove %p from %p", rep, dir);
287
231 /* non-directories can just be unlinked */ 288 /* non-directories can just be unlinked */
232 if (!S_ISDIR(rep->d_inode->i_mode)) { 289 if (!S_ISDIR(rep->d_inode->i_mode)) {
233 _debug("unlink stale object"); 290 _debug("unlink stale object");
234 ret = vfs_unlink(dir->d_inode, rep); 291 ret = vfs_unlink(dir->d_inode, rep);
235 292
293 if (preemptive)
294 cachefiles_mark_object_buried(cache, rep);
295
236 mutex_unlock(&dir->d_inode->i_mutex); 296 mutex_unlock(&dir->d_inode->i_mutex);
237 297
238 if (ret == -EIO) 298 if (ret == -EIO)
@@ -324,6 +384,9 @@ try_again:
324 if (ret != 0 && ret != -ENOMEM) 384 if (ret != 0 && ret != -ENOMEM)
325 cachefiles_io_error(cache, "Rename failed with error %d", ret); 385 cachefiles_io_error(cache, "Rename failed with error %d", ret);
326 386
387 if (preemptive)
388 cachefiles_mark_object_buried(cache, rep);
389
327 unlock_rename(cache->graveyard, dir); 390 unlock_rename(cache->graveyard, dir);
328 dput(grave); 391 dput(grave);
329 _leave(" = 0"); 392 _leave(" = 0");
@@ -339,7 +402,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
339 struct dentry *dir; 402 struct dentry *dir;
340 int ret; 403 int ret;
341 404
342 _enter(",{%p}", object->dentry); 405 _enter(",OBJ%x{%p}", object->fscache.debug_id, object->dentry);
343 406
344 ASSERT(object->dentry); 407 ASSERT(object->dentry);
345 ASSERT(object->dentry->d_inode); 408 ASSERT(object->dentry->d_inode);
@@ -349,15 +412,25 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
349 412
350 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); 413 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
351 414
352 /* we need to check that our parent is _still_ our parent - it may have 415 if (test_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) {
353 * been renamed */ 416 /* object allocation for the same key preemptively deleted this
354 if (dir == object->dentry->d_parent) { 417 * object's file so that it could create its own file */
355 ret = cachefiles_bury_object(cache, dir, object->dentry); 418 _debug("object preemptively buried");
356 } else {
357 /* it got moved, presumably by cachefilesd culling it, so it's
358 * no longer in the key path and we can ignore it */
359 mutex_unlock(&dir->d_inode->i_mutex); 419 mutex_unlock(&dir->d_inode->i_mutex);
360 ret = 0; 420 ret = 0;
421 } else {
422 /* we need to check that our parent is _still_ our parent - it
423 * may have been renamed */
424 if (dir == object->dentry->d_parent) {
425 ret = cachefiles_bury_object(cache, dir,
426 object->dentry, false);
427 } else {
428 /* it got moved, presumably by cachefilesd culling it,
429 * so it's no longer in the key path and we can ignore
430 * it */
431 mutex_unlock(&dir->d_inode->i_mutex);
432 ret = 0;
433 }
361 } 434 }
362 435
363 dput(dir); 436 dput(dir);
@@ -380,7 +453,9 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent,
380 const char *name; 453 const char *name;
381 int ret, nlen; 454 int ret, nlen;
382 455
383 _enter("{%p},,%s,", parent->dentry, key); 456 _enter("OBJ%x{%p},OBJ%x,%s,",
457 parent->fscache.debug_id, parent->dentry,
458 object->fscache.debug_id, key);
384 459
385 cache = container_of(parent->fscache.cache, 460 cache = container_of(parent->fscache.cache,
386 struct cachefiles_cache, cache); 461 struct cachefiles_cache, cache);
@@ -508,7 +583,7 @@ lookup_again:
508 * mutex) */ 583 * mutex) */
509 object->dentry = NULL; 584 object->dentry = NULL;
510 585
511 ret = cachefiles_bury_object(cache, dir, next); 586 ret = cachefiles_bury_object(cache, dir, next, true);
512 dput(next); 587 dput(next);
513 next = NULL; 588 next = NULL;
514 589
@@ -827,7 +902,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
827 /* actually remove the victim (drops the dir mutex) */ 902 /* actually remove the victim (drops the dir mutex) */
828 _debug("bury"); 903 _debug("bury");
829 904
830 ret = cachefiles_bury_object(cache, dir, victim); 905 ret = cachefiles_bury_object(cache, dir, victim, false);
831 if (ret < 0) 906 if (ret < 0)
832 goto error; 907 goto error;
833 908
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 1d8332563863..0f0d41fbb03f 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -10,6 +10,7 @@
10 */ 10 */
11 11
12#include <linux/mount.h> 12#include <linux/mount.h>
13#include <linux/slab.h>
13#include <linux/file.h> 14#include <linux/file.h>
14#include "internal.h" 15#include "internal.h"
15 16
diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c
index b5808cdb2232..039b5011d83b 100644
--- a/fs/cachefiles/security.c
+++ b/fs/cachefiles/security.c
@@ -77,6 +77,8 @@ static int cachefiles_check_cache_dir(struct cachefiles_cache *cache,
77/* 77/*
78 * check the security details of the on-disk cache 78 * check the security details of the on-disk cache
79 * - must be called with security override in force 79 * - must be called with security override in force
80 * - must return with a security override in force - even in the case of an
81 * error
80 */ 82 */
81int cachefiles_determine_cache_security(struct cachefiles_cache *cache, 83int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
82 struct dentry *root, 84 struct dentry *root,
@@ -99,6 +101,8 @@ int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
99 * which create files */ 101 * which create files */
100 ret = set_create_files_as(new, root->d_inode); 102 ret = set_create_files_as(new, root->d_inode);
101 if (ret < 0) { 103 if (ret < 0) {
104 abort_creds(new);
105 cachefiles_begin_secure(cache, _saved_cred);
102 _leave(" = %d [cfa]", ret); 106 _leave(" = %d [cfa]", ret);
103 return ret; 107 return ret;
104 } 108 }
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index f3e7a0bf068b..e18b183b47e1 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -16,6 +16,7 @@
16#include <linux/fsnotify.h> 16#include <linux/fsnotify.h>
17#include <linux/quotaops.h> 17#include <linux/quotaops.h>
18#include <linux/xattr.h> 18#include <linux/xattr.h>
19#include <linux/slab.h>
19#include "internal.h" 20#include "internal.h"
20 21
21static const char cachefiles_xattr_cache[] = 22static const char cachefiles_xattr_cache[] =
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
new file mode 100644
index 000000000000..04b8280582a9
--- /dev/null
+++ b/fs/ceph/Kconfig
@@ -0,0 +1,27 @@
1config CEPH_FS
2 tristate "Ceph distributed file system (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL
4 select LIBCRC32C
 5 select CRYPTO_AES
6 help
7 Choose Y or M here to include support for mounting the
8 experimental Ceph distributed file system. Ceph is an extremely
9 scalable file system designed to provide high performance,
10 reliable access to petabytes of storage.
11
12 More information at http://ceph.newdream.net/.
13
14 If unsure, say N.
15
16config CEPH_FS_PRETTYDEBUG
17 bool "Include file:line in ceph debug output"
18 depends on CEPH_FS
19 default n
20 help
21 If you say Y here, debug output will include a filename and
 22 line to aid debugging. This increases kernel size and slows
23 execution slightly when debug call sites are enabled (e.g.,
24 via CONFIG_DYNAMIC_DEBUG).
25
26 If unsure, say N.
27
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
new file mode 100644
index 000000000000..6a660e610be8
--- /dev/null
+++ b/fs/ceph/Makefile
@@ -0,0 +1,39 @@
1#
2# Makefile for CEPH filesystem.
3#
4
5ifneq ($(KERNELRELEASE),)
6
7obj-$(CONFIG_CEPH_FS) += ceph.o
8
9ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \
10 export.o caps.o snap.o xattr.o \
11 messenger.o msgpool.o buffer.o pagelist.o \
12 mds_client.o mdsmap.o \
13 mon_client.o \
14 osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
15 debugfs.o \
16 auth.o auth_none.o \
17 crypto.o armor.o \
18 auth_x.o \
19 ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
20
21else
 22# Otherwise we were called directly from the command
23# line; invoke the kernel build system.
24
25KERNELDIR ?= /lib/modules/$(shell uname -r)/build
26PWD := $(shell pwd)
27
28default: all
29
30all:
31 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
32
33modules_install:
34 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
35
36clean:
37 $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
38
39endif
diff --git a/fs/ceph/README b/fs/ceph/README
new file mode 100644
index 000000000000..18352fab37c0
--- /dev/null
+++ b/fs/ceph/README
@@ -0,0 +1,20 @@
1#
2# The following files are shared by (and manually synchronized
3# between) the Ceph userland and kernel client.
4#
5# userland kernel
6src/include/ceph_fs.h fs/ceph/ceph_fs.h
7src/include/ceph_fs.cc fs/ceph/ceph_fs.c
8src/include/msgr.h fs/ceph/msgr.h
9src/include/rados.h fs/ceph/rados.h
10src/include/ceph_strings.cc fs/ceph/ceph_strings.c
11src/include/ceph_frag.h fs/ceph/ceph_frag.h
12src/include/ceph_frag.cc fs/ceph/ceph_frag.c
13src/include/ceph_hash.h fs/ceph/ceph_hash.h
14src/include/ceph_hash.cc fs/ceph/ceph_hash.c
15src/crush/crush.c fs/ceph/crush/crush.c
16src/crush/crush.h fs/ceph/crush/crush.h
17src/crush/mapper.c fs/ceph/crush/mapper.c
18src/crush/mapper.h fs/ceph/crush/mapper.h
19src/crush/hash.h fs/ceph/crush/hash.h
20src/crush/hash.c fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
new file mode 100644
index 000000000000..a9005d862ed4
--- /dev/null
+++ b/fs/ceph/addr.c
@@ -0,0 +1,1187 @@
1#include "ceph_debug.h"
2
3#include <linux/backing-dev.h>
4#include <linux/fs.h>
5#include <linux/mm.h>
6#include <linux/pagemap.h>
7#include <linux/writeback.h> /* generic_writepages */
8#include <linux/slab.h>
9#include <linux/pagevec.h>
10#include <linux/task_io_accounting_ops.h>
11
12#include "super.h"
13#include "osd_client.h"
14
15/*
16 * Ceph address space ops.
17 *
18 * There are a few funny things going on here.
19 *
20 * The page->private field is used to reference a struct
21 * ceph_snap_context for _every_ dirty page. This indicates which
22 * snapshot the page was logically dirtied in, and thus which snap
23 * context needs to be associated with the osd write during writeback.
24 *
25 * Similarly, struct ceph_inode_info maintains a set of counters to
 26 * count dirty pages on the inode. In the absence of snapshots,
27 * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
28 *
29 * When a snapshot is taken (that is, when the client receives
30 * notification that a snapshot was taken), each inode with caps and
31 * with dirty pages (dirty pages implies there is a cap) gets a new
32 * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
33 * order, new snaps go to the tail). The i_wrbuffer_ref_head count is
34 * moved to capsnap->dirty. (Unless a sync write is currently in
35 * progress. In that case, the capsnap is said to be "pending", new
36 * writes cannot start, and the capsnap isn't "finalized" until the
37 * write completes (or fails) and a final size/mtime for the inode for
38 * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0.
39 *
40 * On writeback, we must submit writes to the osd IN SNAP ORDER. So,
41 * we look for the first capsnap in i_cap_snaps and write out pages in
42 * that snap context _only_. Then we move on to the next capsnap,
43 * eventually reaching the "live" or "head" context (i.e., pages that
44 * are not yet snapped) and are writing the most recently dirtied
45 * pages.
46 *
47 * Invalidate and so forth must take care to ensure the dirty page
48 * accounting is preserved.
49 */
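/*
 * Hedged illustration of the invariant above (editorial, not part of
 * the file): with no snapshots outstanding, the two counters track the
 * same set of pages, so a debug assertion could read:
 *
 *	if (list_empty(&ci->i_cap_snaps))
 *		WARN_ON(ci->i_wrbuffer_ref != ci->i_wrbuffer_ref_head);
 */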
50
51#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
52#define CONGESTION_OFF_THRESH(congestion_kb) \
53 (CONGESTION_ON_THRESH(congestion_kb) - \
54 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
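/*
 * Worked example (values assumed): with 4 KiB pages, PAGE_SHIFT is 12,
 * so for congestion_kb == 8192:
 *
 *	CONGESTION_ON_THRESH  = 8192 >> 2     = 2048 pages
 *	CONGESTION_OFF_THRESH = 2048 - 512    = 1536 pages
 *
 * i.e. the bdi is marked congested at 2048 in-flight dirty pages and
 * is uncongested again only below 1536, giving ~25% hysteresis.
 */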
55
56
57
58/*
59 * Dirty a page. Optimistically adjust accounting, on the assumption
60 * that we won't race with invalidate. If we do, readjust.
61 */
62static int ceph_set_page_dirty(struct page *page)
63{
64 struct address_space *mapping = page->mapping;
65 struct inode *inode;
66 struct ceph_inode_info *ci;
67 int undo = 0;
68 struct ceph_snap_context *snapc;
69
70 if (unlikely(!mapping))
71 return !TestSetPageDirty(page);
72
73 if (TestSetPageDirty(page)) {
74 dout("%p set_page_dirty %p idx %lu -- already dirty\n",
75 mapping->host, page, page->index);
76 return 0;
77 }
78
79 inode = mapping->host;
80 ci = ceph_inode(inode);
81
82 /*
83 * Note that we're grabbing a snapc ref here without holding
84 * any locks!
85 */
86 snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
87
88 /* dirty the head */
89 spin_lock(&inode->i_lock);
90 if (ci->i_wrbuffer_ref_head == 0)
91 ci->i_head_snapc = ceph_get_snap_context(snapc);
92 ++ci->i_wrbuffer_ref_head;
93 if (ci->i_wrbuffer_ref == 0)
94 igrab(inode);
95 ++ci->i_wrbuffer_ref;
96 dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
97 "snapc %p seq %lld (%d snaps)\n",
98 mapping->host, page, page->index,
99 ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
100 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
101 snapc, snapc->seq, snapc->num_snaps);
102 spin_unlock(&inode->i_lock);
103
104 /* now adjust page */
105 spin_lock_irq(&mapping->tree_lock);
106 if (page->mapping) { /* Race with truncate? */
107 WARN_ON_ONCE(!PageUptodate(page));
108
109 if (mapping_cap_account_dirty(mapping)) {
110 __inc_zone_page_state(page, NR_FILE_DIRTY);
111 __inc_bdi_stat(mapping->backing_dev_info,
112 BDI_RECLAIMABLE);
113 task_io_account_write(PAGE_CACHE_SIZE);
114 }
115 radix_tree_tag_set(&mapping->page_tree,
116 page_index(page), PAGECACHE_TAG_DIRTY);
117
118 /*
119 * Reference snap context in page->private. Also set
120 * PagePrivate so that we get invalidatepage callback.
121 */
122 page->private = (unsigned long)snapc;
123 SetPagePrivate(page);
124 } else {
125 dout("ANON set_page_dirty %p (raced truncate?)\n", page);
126 undo = 1;
127 }
128
129 spin_unlock_irq(&mapping->tree_lock);
130
131 if (undo)
132 /* whoops, we failed to dirty the page */
133 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
134
135 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
136
137 BUG_ON(!PageDirty(page));
138 return 1;
139}
140
141/*
142 * If we are truncating the full page (i.e. offset == 0), adjust the
143 * dirty page counters appropriately. Only called if there is private
144 * data on the page.
145 */
146static void ceph_invalidatepage(struct page *page, unsigned long offset)
147{
148 struct inode *inode;
149 struct ceph_inode_info *ci;
150 struct ceph_snap_context *snapc = (void *)page->private;
151
152 BUG_ON(!PageLocked(page));
153 BUG_ON(!page->private);
154 BUG_ON(!PagePrivate(page));
155 BUG_ON(!page->mapping);
156
157 inode = page->mapping->host;
158
159 /*
160 * We can get non-dirty pages here due to races between
161 * set_page_dirty and truncate_complete_page; just spit out a
162 * warning, in case we end up with accounting problems later.
163 */
164 if (!PageDirty(page))
165 pr_err("%p invalidatepage %p page not dirty\n", inode, page);
166
167 if (offset == 0)
168 ClearPageChecked(page);
169
170 ci = ceph_inode(inode);
171 if (offset == 0) {
172 dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
173 inode, page, page->index, offset);
174 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
175 ceph_put_snap_context(snapc);
176 page->private = 0;
177 ClearPagePrivate(page);
178 } else {
179 dout("%p invalidatepage %p idx %lu partial dirty page\n",
180 inode, page, page->index);
181 }
182}
183
184/* just a sanity check */
185static int ceph_releasepage(struct page *page, gfp_t g)
186{
187 struct inode *inode = page->mapping ? page->mapping->host : NULL;
188 dout("%p releasepage %p idx %lu\n", inode, page, page->index);
189 WARN_ON(PageDirty(page));
190 WARN_ON(page->private);
191 WARN_ON(PagePrivate(page));
192 return 0;
193}
194
195/*
196 * read a single page, without unlocking it.
197 */
198static int readpage_nounlock(struct file *filp, struct page *page)
199{
200 struct inode *inode = filp->f_dentry->d_inode;
201 struct ceph_inode_info *ci = ceph_inode(inode);
202 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
203 int err = 0;
204 u64 len = PAGE_CACHE_SIZE;
205
206 dout("readpage inode %p file %p page %p index %lu\n",
207 inode, filp, page, page->index);
208 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
209 page->index << PAGE_CACHE_SHIFT, &len,
210 ci->i_truncate_seq, ci->i_truncate_size,
211 &page, 1);
212 if (err == -ENOENT)
213 err = 0;
214 if (err < 0) {
215 SetPageError(page);
216 goto out;
217 } else if (err < PAGE_CACHE_SIZE) {
218 /* zero fill remainder of page */
219 zero_user_segment(page, err, PAGE_CACHE_SIZE);
220 }
221 SetPageUptodate(page);
222
223out:
224 return err < 0 ? err : 0;
225}
226
227static int ceph_readpage(struct file *filp, struct page *page)
228{
229 int r = readpage_nounlock(filp, page);
230 unlock_page(page);
231 return r;
232}
233
234/*
235 * Build a vector of contiguous pages from the provided page list.
236 */
237static struct page **page_vector_from_list(struct list_head *page_list,
238 unsigned *nr_pages)
239{
240 struct page **pages;
241 struct page *page;
242 int next_index, contig_pages = 0;
243
244 /* build page vector */
245 pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS);
246 if (!pages)
247 return ERR_PTR(-ENOMEM);
248
249 BUG_ON(list_empty(page_list));
250 next_index = list_entry(page_list->prev, struct page, lru)->index;
251 list_for_each_entry_reverse(page, page_list, lru) {
252 if (page->index == next_index) {
253 dout("readpages page %d %p\n", contig_pages, page);
254 pages[contig_pages] = page;
255 contig_pages++;
256 next_index++;
257 } else {
258 break;
259 }
260 }
261 *nr_pages = contig_pages;
262 return pages;
263}
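/*
 * Hedged example (indices invented): readpages hands us the list with
 * the lowest index at the tail, so for pages 3, 4, 5, 9 the walk
 * starts at index 3, stops at the gap before 9, and returns the
 * vector {3, 4, 5} with *nr_pages trimmed to 3.
 */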
264
265/*
266 * Read multiple pages. Leave pages we don't read + unlock in page_list;
267 * the caller (VM) cleans them up.
268 */
269static int ceph_readpages(struct file *file, struct address_space *mapping,
270 struct list_head *page_list, unsigned nr_pages)
271{
272 struct inode *inode = file->f_dentry->d_inode;
273 struct ceph_inode_info *ci = ceph_inode(inode);
274 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
275 int rc = 0;
276 struct page **pages;
277 struct pagevec pvec;
278 loff_t offset;
279 u64 len;
280
281 dout("readpages %p file %p nr_pages %d\n",
282 inode, file, nr_pages);
283
284 pages = page_vector_from_list(page_list, &nr_pages);
285 if (IS_ERR(pages))
286 return PTR_ERR(pages);
287
288 /* guess read extent */
289 offset = pages[0]->index << PAGE_CACHE_SHIFT;
290 len = nr_pages << PAGE_CACHE_SHIFT;
291 rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
292 offset, &len,
293 ci->i_truncate_seq, ci->i_truncate_size,
294 pages, nr_pages);
295 if (rc == -ENOENT)
296 rc = 0;
297 if (rc < 0)
298 goto out;
299
300 /* set uptodate and add to lru in pagevec-sized chunks */
301 pagevec_init(&pvec, 0);
302 for (; !list_empty(page_list) && len > 0;
303 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
304 struct page *page =
305 list_entry(page_list->prev, struct page, lru);
306
307 list_del(&page->lru);
308
309 if (rc < (int)PAGE_CACHE_SIZE) {
310 /* zero (remainder of) page */
311 int s = rc < 0 ? 0 : rc;
312 zero_user_segment(page, s, PAGE_CACHE_SIZE);
313 }
314
315 if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) {
316 page_cache_release(page);
317 dout("readpages %p add_to_page_cache failed %p\n",
318 inode, page);
319 continue;
320 }
321 dout("readpages %p adding %p idx %lu\n", inode, page,
322 page->index);
323 flush_dcache_page(page);
324 SetPageUptodate(page);
325 unlock_page(page);
326 if (pagevec_add(&pvec, page) == 0)
327 pagevec_lru_add_file(&pvec); /* add to lru */
328 }
329 pagevec_lru_add_file(&pvec);
330 rc = 0;
331
332out:
333 kfree(pages);
334 return rc;
335}
336
337/*
338 * Get ref for the oldest snapc for an inode with dirty data... that is, the
339 * only snap context we are allowed to write back.
340 */
341static struct ceph_snap_context *get_oldest_context(struct inode *inode,
342 u64 *snap_size)
343{
344 struct ceph_inode_info *ci = ceph_inode(inode);
345 struct ceph_snap_context *snapc = NULL;
346 struct ceph_cap_snap *capsnap = NULL;
347
348 spin_lock(&inode->i_lock);
349 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
350 dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
351 capsnap->context, capsnap->dirty_pages);
352 if (capsnap->dirty_pages) {
353 snapc = ceph_get_snap_context(capsnap->context);
354 if (snap_size)
355 *snap_size = capsnap->size;
356 break;
357 }
358 }
359 if (!snapc && ci->i_head_snapc) {
360 snapc = ceph_get_snap_context(ci->i_head_snapc);
361 dout(" head snapc %p has %d dirty pages\n",
362 snapc, ci->i_wrbuffer_ref_head);
363 }
364 spin_unlock(&inode->i_lock);
365 return snapc;
366}
367
368/*
369 * Write a single page, but leave the page locked.
370 *
371 * If we get a write error, set the page error bit, but still adjust the
372 * dirty page accounting (i.e., page is no longer dirty).
373 */
374static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
375{
376 struct inode *inode;
377 struct ceph_inode_info *ci;
378 struct ceph_client *client;
379 struct ceph_osd_client *osdc;
380 loff_t page_off = page->index << PAGE_CACHE_SHIFT;
381 int len = PAGE_CACHE_SIZE;
382 loff_t i_size;
383 int err = 0;
384 struct ceph_snap_context *snapc, *oldest;
385 u64 snap_size = 0;
386 long writeback_stat;
387
388 dout("writepage %p idx %lu\n", page, page->index);
389
390 if (!page->mapping || !page->mapping->host) {
391 dout("writepage %p - no mapping\n", page);
392 return -EFAULT;
393 }
394 inode = page->mapping->host;
395 ci = ceph_inode(inode);
396 client = ceph_inode_to_client(inode);
397 osdc = &client->osdc;
398
399 /* verify this is a writeable snap context */
400 snapc = (void *)page->private;
401 if (snapc == NULL) {
402 dout("writepage %p page %p not dirty?\n", inode, page);
403 goto out;
404 }
405 oldest = get_oldest_context(inode, &snap_size);
406 if (snapc->seq > oldest->seq) {
407 dout("writepage %p page %p snapc %p not writeable - noop\n",
408 inode, page, (void *)page->private);
409 /* we should only noop if called by kswapd */
410 WARN_ON((current->flags & PF_MEMALLOC) == 0);
411 ceph_put_snap_context(oldest);
412 goto out;
413 }
414 ceph_put_snap_context(oldest);
415
416 /* is this a partial page at end of file? */
417 if (snap_size)
418 i_size = snap_size;
419 else
420 i_size = i_size_read(inode);
421 if (i_size < page_off + len)
422 len = i_size - page_off;
423
424 dout("writepage %p page %p index %lu on %llu~%u\n",
425 inode, page, page->index, page_off, len);
426
427 writeback_stat = atomic_long_inc_return(&client->writeback_count);
428 if (writeback_stat >
429 CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
430 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
431
432 set_page_writeback(page);
433 err = ceph_osdc_writepages(osdc, ceph_vino(inode),
434 &ci->i_layout, snapc,
435 page_off, len,
436 ci->i_truncate_seq, ci->i_truncate_size,
437 &inode->i_mtime,
438 &page, 1, 0, 0, true);
439 if (err < 0) {
440 dout("writepage setting page/mapping error %d %p\n", err, page);
441 SetPageError(page);
442 mapping_set_error(&inode->i_data, err);
443 if (wbc)
444 wbc->pages_skipped++;
445 } else {
446 dout("writepage cleaned page %p\n", page);
447 err = 0; /* vfs expects us to return 0 */
448 }
449 page->private = 0;
450 ClearPagePrivate(page);
451 end_page_writeback(page);
452 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
453 ceph_put_snap_context(snapc); /* page's reference */
454out:
455 return err;
456}
457
458static int ceph_writepage(struct page *page, struct writeback_control *wbc)
459{
460 int err;
461 struct inode *inode = page->mapping->host;
462 BUG_ON(!inode);
463 igrab(inode);
464 err = writepage_nounlock(page, wbc);
465 unlock_page(page);
466 iput(inode);
467 return err;
468}
469
470
471/*
472 * lame release_pages helper. release_pages() isn't exported to
473 * modules.
474 */
475static void ceph_release_pages(struct page **pages, int num)
476{
477 struct pagevec pvec;
478 int i;
479
480 pagevec_init(&pvec, 0);
481 for (i = 0; i < num; i++) {
482 if (pagevec_add(&pvec, pages[i]) == 0)
483 pagevec_release(&pvec);
484 }
485 pagevec_release(&pvec);
486}
487
488
489/*
490 * async writeback completion handler.
491 *
492 * If we get an error, set the mapping error bit, but not the individual
493 * page error bits.
494 */
495static void writepages_finish(struct ceph_osd_request *req,
496 struct ceph_msg *msg)
497{
498 struct inode *inode = req->r_inode;
499 struct ceph_osd_reply_head *replyhead;
500 struct ceph_osd_op *op;
501 struct ceph_inode_info *ci = ceph_inode(inode);
502 unsigned wrote;
503 struct page *page;
504 int i;
505 struct ceph_snap_context *snapc = req->r_snapc;
506 struct address_space *mapping = inode->i_mapping;
507 __s32 rc = -EIO;
508 u64 bytes = 0;
509 struct ceph_client *client = ceph_inode_to_client(inode);
510 long writeback_stat;
511 unsigned issued = ceph_caps_issued(ci);
512
513 /* parse reply */
514 replyhead = msg->front.iov_base;
515 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
516 op = (void *)(replyhead + 1);
517 rc = le32_to_cpu(replyhead->result);
518 bytes = le64_to_cpu(op->extent.length);
519
520 if (rc >= 0) {
521 /*
522 * Assume we wrote the pages we originally sent. The
523 * osd might reply with fewer pages if our writeback
524 * raced with a truncation and was adjusted at the osd,
525 * so don't believe the reply.
526 */
527 wrote = req->r_num_pages;
528 } else {
529 wrote = 0;
530 mapping_set_error(mapping, rc);
531 }
532 dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
533 inode, rc, bytes, wrote);
534
535 /* clean all pages */
536 for (i = 0; i < req->r_num_pages; i++) {
537 page = req->r_pages[i];
538 BUG_ON(!page);
539 WARN_ON(!PageUptodate(page));
540
541 writeback_stat =
542 atomic_long_dec_return(&client->writeback_count);
543 if (writeback_stat <
544 CONGESTION_OFF_THRESH(client->mount_args->congestion_kb))
545 clear_bdi_congested(&client->backing_dev_info,
546 BLK_RW_ASYNC);
547
548 ceph_put_snap_context((void *)page->private);
549 page->private = 0;
550 ClearPagePrivate(page);
551 dout("unlocking %d %p\n", i, page);
552 end_page_writeback(page);
553
554 /*
555 * We lost the cache cap, need to truncate the page before
556 * it is unlocked, otherwise we'd truncate it later in the
557 * page truncation thread, possibly losing some data that
558 * raced its way in
559 */
560 if ((issued & CEPH_CAP_FILE_CACHE) == 0)
561 generic_error_remove_page(inode->i_mapping, page);
562
563 unlock_page(page);
564 }
565 dout("%p wrote+cleaned %d pages\n", inode, wrote);
566 ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc);
567
568 ceph_release_pages(req->r_pages, req->r_num_pages);
569 if (req->r_pages_from_pool)
570 mempool_free(req->r_pages,
571 ceph_client(inode->i_sb)->wb_pagevec_pool);
572 else
573 kfree(req->r_pages);
574 ceph_osdc_put_request(req);
575}
576
577/*
 578 * allocate a page vec, either directly, or if necessary, via the
579 * mempool. we avoid the mempool if we can because req->r_num_pages
580 * may be less than the maximum write size.
581 */
582static void alloc_page_vec(struct ceph_client *client,
583 struct ceph_osd_request *req)
584{
585 req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
586 GFP_NOFS);
587 if (!req->r_pages) {
588 req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS);
589 req->r_pages_from_pool = 1;
590 WARN_ON(!req->r_pages);
591 }
592}
593
594/*
595 * initiate async writeback
596 */
597static int ceph_writepages_start(struct address_space *mapping,
598 struct writeback_control *wbc)
599{
600 struct inode *inode = mapping->host;
601 struct backing_dev_info *bdi = mapping->backing_dev_info;
602 struct ceph_inode_info *ci = ceph_inode(inode);
603 struct ceph_client *client;
604 pgoff_t index, start, end;
605 int range_whole = 0;
606 int should_loop = 1;
607 pgoff_t max_pages = 0, max_pages_ever = 0;
608 struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
609 struct pagevec pvec;
610 int done = 0;
611 int rc = 0;
612 unsigned wsize = 1 << inode->i_blkbits;
613 struct ceph_osd_request *req = NULL;
614 int do_sync;
615 u64 snap_size = 0;
616
617 /*
618 * Include a 'sync' in the OSD request if this is a data
619 * integrity write (e.g., O_SYNC write or fsync()), or if our
620 * cap is being revoked.
621 */
622 do_sync = wbc->sync_mode == WB_SYNC_ALL;
623 if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
624 do_sync = 1;
625 dout("writepages_start %p dosync=%d (mode=%s)\n",
626 inode, do_sync,
627 wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
628 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
629
630 client = ceph_inode_to_client(inode);
631 if (client->mount_state == CEPH_MOUNT_SHUTDOWN) {
632 pr_warning("writepage_start %p on forced umount\n", inode);
633 return -EIO; /* we're in a forced umount, don't write! */
634 }
635 if (client->mount_args->wsize && client->mount_args->wsize < wsize)
636 wsize = client->mount_args->wsize;
637 if (wsize < PAGE_CACHE_SIZE)
638 wsize = PAGE_CACHE_SIZE;
639 max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
640
641 pagevec_init(&pvec, 0);
642
 643 /* bail if the caller won't block and the bdi is congested */
644 if (wbc->nonblocking && bdi_write_congested(bdi)) {
645 dout(" writepages congested\n");
646 wbc->encountered_congestion = 1;
647 goto out_final;
648 }
649
650 /* where to start/end? */
651 if (wbc->range_cyclic) {
652 start = mapping->writeback_index; /* Start from prev offset */
653 end = -1;
654 dout(" cyclic, start at %lu\n", start);
655 } else {
656 start = wbc->range_start >> PAGE_CACHE_SHIFT;
657 end = wbc->range_end >> PAGE_CACHE_SHIFT;
658 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
659 range_whole = 1;
660 should_loop = 0;
661 dout(" not cyclic, %lu to %lu\n", start, end);
662 }
663 index = start;
664
665retry:
666 /* find oldest snap context with dirty data */
667 ceph_put_snap_context(snapc);
668 snapc = get_oldest_context(inode, &snap_size);
669 if (!snapc) {
670 /* hmm, why does writepages get called when there
671 is no dirty data? */
672 dout(" no snap context with dirty data?\n");
673 goto out;
674 }
675 dout(" oldest snapc is %p seq %lld (%d snaps)\n",
676 snapc, snapc->seq, snapc->num_snaps);
677 if (last_snapc && snapc != last_snapc) {
678 /* if we switched to a newer snapc, restart our scan at the
679 * start of the original file range. */
680 dout(" snapc differs from last pass, restarting at %lu\n",
681 index);
682 index = start;
683 }
684 last_snapc = snapc;
685
686 while (!done && index <= end) {
687 unsigned i;
688 int first;
689 pgoff_t next;
690 int pvec_pages, locked_pages;
691 struct page *page;
692 int want;
693 u64 offset, len;
694 struct ceph_osd_request_head *reqhead;
695 struct ceph_osd_op *op;
696 long writeback_stat;
697
698 next = 0;
699 locked_pages = 0;
700 max_pages = max_pages_ever;
701
702get_more_pages:
703 first = -1;
704 want = min(end - index,
705 min((pgoff_t)PAGEVEC_SIZE,
706 max_pages - (pgoff_t)locked_pages) - 1)
707 + 1;
708 pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
709 PAGECACHE_TAG_DIRTY,
710 want);
711 dout("pagevec_lookup_tag got %d\n", pvec_pages);
712 if (!pvec_pages && !locked_pages)
713 break;
714 for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
715 page = pvec.pages[i];
716 dout("? %p idx %lu\n", page, page->index);
717 if (locked_pages == 0)
718 lock_page(page); /* first page */
719 else if (!trylock_page(page))
720 break;
721
722 /* only dirty pages, or our accounting breaks */
723 if (unlikely(!PageDirty(page)) ||
724 unlikely(page->mapping != mapping)) {
725 dout("!dirty or !mapping %p\n", page);
726 unlock_page(page);
727 break;
728 }
729 if (!wbc->range_cyclic && page->index > end) {
730 dout("end of range %p\n", page);
731 done = 1;
732 unlock_page(page);
733 break;
734 }
735 if (next && (page->index != next)) {
736 dout("not consecutive %p\n", page);
737 unlock_page(page);
738 break;
739 }
740 if (wbc->sync_mode != WB_SYNC_NONE) {
741 dout("waiting on writeback %p\n", page);
742 wait_on_page_writeback(page);
743 }
744 if ((snap_size && page_offset(page) > snap_size) ||
745 (!snap_size &&
746 page_offset(page) > i_size_read(inode))) {
747 dout("%p page eof %llu\n", page, snap_size ?
748 snap_size : i_size_read(inode));
749 done = 1;
750 unlock_page(page);
751 break;
752 }
753 if (PageWriteback(page)) {
754 dout("%p under writeback\n", page);
755 unlock_page(page);
756 break;
757 }
758
759 /* only if matching snap context */
760 pgsnapc = (void *)page->private;
761 if (pgsnapc->seq > snapc->seq) {
762 dout("page snapc %p %lld > oldest %p %lld\n",
763 pgsnapc, pgsnapc->seq, snapc, snapc->seq);
764 unlock_page(page);
765 if (!locked_pages)
766 continue; /* keep looking for snap */
767 break;
768 }
769
770 if (!clear_page_dirty_for_io(page)) {
771 dout("%p !clear_page_dirty_for_io\n", page);
772 unlock_page(page);
773 break;
774 }
775
776 /* ok */
777 if (locked_pages == 0) {
778 /* prepare async write request */
779 offset = page->index << PAGE_CACHE_SHIFT;
780 len = wsize;
781 req = ceph_osdc_new_request(&client->osdc,
782 &ci->i_layout,
783 ceph_vino(inode),
784 offset, &len,
785 CEPH_OSD_OP_WRITE,
786 CEPH_OSD_FLAG_WRITE |
787 CEPH_OSD_FLAG_ONDISK,
788 snapc, do_sync,
789 ci->i_truncate_seq,
790 ci->i_truncate_size,
791 &inode->i_mtime, true, 1);
792 max_pages = req->r_num_pages;
793
794 alloc_page_vec(client, req);
795 req->r_callback = writepages_finish;
796 req->r_inode = inode;
797 }
798
799 /* note position of first page in pvec */
800 if (first < 0)
801 first = i;
802 dout("%p will write page %p idx %lu\n",
803 inode, page, page->index);
804
805 writeback_stat = atomic_long_inc_return(&client->writeback_count);
806 if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) {
807 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
808 }
809
810 set_page_writeback(page);
811 req->r_pages[locked_pages] = page;
812 locked_pages++;
813 next = page->index + 1;
814 }
815
816 /* did we get anything? */
817 if (!locked_pages)
818 goto release_pvec_pages;
819 if (i) {
820 int j;
821 BUG_ON(!locked_pages || first < 0);
822
823 if (pvec_pages && i == pvec_pages &&
824 locked_pages < max_pages) {
825 dout("reached end pvec, trying for more\n");
826 pagevec_reinit(&pvec);
827 goto get_more_pages;
828 }
829
830 /* shift unused pages over in the pvec... we
831 * will need to release them below. */
832 for (j = i; j < pvec_pages; j++) {
833 dout(" pvec leftover page %p\n",
834 pvec.pages[j]);
835 pvec.pages[j-i+first] = pvec.pages[j];
836 }
837 pvec.nr -= i-first;
838 }
839
840 /* submit the write */
841 offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT;
842 len = min((snap_size ? snap_size : i_size_read(inode)) - offset,
843 (u64)locked_pages << PAGE_CACHE_SHIFT);
844 dout("writepages got %d pages at %llu~%llu\n",
845 locked_pages, offset, len);
846
847 /* revise final length, page count */
848 req->r_num_pages = locked_pages;
849 reqhead = req->r_request->front.iov_base;
850 op = (void *)(reqhead + 1);
851 op->extent.length = cpu_to_le64(len);
852 op->payload_len = cpu_to_le32(len);
853 req->r_request->hdr.data_len = cpu_to_le32(len);
854
855 ceph_osdc_start_request(&client->osdc, req, true);
856 req = NULL;
857
858 /* continue? */
859 index = next;
860 wbc->nr_to_write -= locked_pages;
861 if (wbc->nr_to_write <= 0)
862 done = 1;
863
864release_pvec_pages:
865 dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
866 pvec.nr ? pvec.pages[0] : NULL);
867 pagevec_release(&pvec);
868
869 if (locked_pages && !done)
870 goto retry;
871 }
872
873 if (should_loop && !done) {
874 /* more to do; loop back to beginning of file */
875 dout("writepages looping back to beginning of file\n");
876 should_loop = 0;
877 index = 0;
878 goto retry;
879 }
880
881 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
882 mapping->writeback_index = index;
883
884out:
885 if (req)
886 ceph_osdc_put_request(req);
887 if (rc > 0)
888 rc = 0; /* vfs expects us to return 0 */
889 ceph_put_snap_context(snapc);
890 dout("writepages done, rc = %d\n", rc);
891out_final:
892 return rc;
893}
894
895
896
897/*
898 * See if a given @snapc is either writeable, or already written.
899 */
900static int context_is_writeable_or_written(struct inode *inode,
901 struct ceph_snap_context *snapc)
902{
903 struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
904 int ret = !oldest || snapc->seq <= oldest->seq;
905
906 ceph_put_snap_context(oldest);
907 return ret;
908}
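/*
 * Hedged worked example: suppose i_cap_snaps holds capsnaps with
 * snapc->seq 5 and 9 and the head context has seq 12.
 * get_oldest_context() returns the seq-5 context, so a page dirtied
 * under seq 5 is writeable now, while contexts 9 and 12 make this
 * helper return false until the seq-5 pages have been flushed. If
 * nothing is dirty at all, oldest is NULL and everything counts as
 * already written.
 */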
909
910/*
911 * We are only allowed to write into/dirty the page if the page is
912 * clean, or already dirty within the same snap context.
913 *
914 * called with page locked.
915 * return success with page locked,
916 * or any failure (incl -EAGAIN) with page unlocked.
917 */
918static int ceph_update_writeable_page(struct file *file,
919 loff_t pos, unsigned len,
920 struct page *page)
921{
922 struct inode *inode = file->f_dentry->d_inode;
923 struct ceph_inode_info *ci = ceph_inode(inode);
924 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
925 loff_t page_off = pos & PAGE_CACHE_MASK;
926 int pos_in_page = pos & ~PAGE_CACHE_MASK;
927 int end_in_page = pos_in_page + len;
928 loff_t i_size;
929 int r;
930 struct ceph_snap_context *snapc, *oldest;
931
932retry_locked:
 933 /* writepages currently holds the page lock; wait in case that changes later */
934 wait_on_page_writeback(page);
935
936 /* check snap context */
937 BUG_ON(!ci->i_snap_realm);
938 down_read(&mdsc->snap_rwsem);
939 BUG_ON(!ci->i_snap_realm->cached_context);
940 snapc = (void *)page->private;
941 if (snapc && snapc != ci->i_head_snapc) {
942 /*
943 * this page is already dirty in another (older) snap
944 * context! is it writeable now?
945 */
946 oldest = get_oldest_context(inode, NULL);
947 up_read(&mdsc->snap_rwsem);
948
949 if (snapc->seq > oldest->seq) {
950 ceph_put_snap_context(oldest);
951 dout(" page %p snapc %p not current or oldest\n",
952 page, snapc);
953 /*
954 * queue for writeback, and wait for snapc to
955 * be writeable or written
956 */
957 snapc = ceph_get_snap_context(snapc);
958 unlock_page(page);
959 ceph_queue_writeback(inode);
960 r = wait_event_interruptible(ci->i_cap_wq,
961 context_is_writeable_or_written(inode, snapc));
962 ceph_put_snap_context(snapc);
963 if (r == -ERESTARTSYS)
964 return r;
965 return -EAGAIN;
966 }
967 ceph_put_snap_context(oldest);
968
969 /* yay, writeable, do it now (without dropping page lock) */
970 dout(" page %p snapc %p not current, but oldest\n",
971 page, snapc);
972 if (!clear_page_dirty_for_io(page))
973 goto retry_locked;
974 r = writepage_nounlock(page, NULL);
975 if (r < 0)
976 goto fail_nosnap;
977 goto retry_locked;
978 }
979
980 if (PageUptodate(page)) {
981 dout(" page %p already uptodate\n", page);
982 return 0;
983 }
984
985 /* full page? */
986 if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
987 return 0;
988
989 /* past end of file? */
990 i_size = inode->i_size; /* caller holds i_mutex */
991
992 if (i_size + len > inode->i_sb->s_maxbytes) {
993 /* file is too big */
994 r = -EINVAL;
995 goto fail;
996 }
997
998 if (page_off >= i_size ||
999 (pos_in_page == 0 && (pos+len) >= i_size &&
1000 end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
1001 dout(" zeroing %p 0 - %d and %d - %d\n",
1002 page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
1003 zero_user_segments(page,
1004 0, pos_in_page,
1005 end_in_page, PAGE_CACHE_SIZE);
1006 return 0;
1007 }
1008
1009 /* we need to read it. */
1010 up_read(&mdsc->snap_rwsem);
1011 r = readpage_nounlock(file, page);
1012 if (r < 0)
1013 goto fail_nosnap;
1014 goto retry_locked;
1015
1016fail:
1017 up_read(&mdsc->snap_rwsem);
1018fail_nosnap:
1019 unlock_page(page);
1020 return r;
1021}
1022
1023/*
1024 * We are only allowed to write into/dirty the page if the page is
1025 * clean, or already dirty within the same snap context.
1026 */
1027static int ceph_write_begin(struct file *file, struct address_space *mapping,
1028 loff_t pos, unsigned len, unsigned flags,
1029 struct page **pagep, void **fsdata)
1030{
1031 struct inode *inode = file->f_dentry->d_inode;
1032 struct page *page;
1033 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1034 int r;
1035
1036 do {
1037 /* get a page */
1038 page = grab_cache_page_write_begin(mapping, index, 0);
1039 if (!page)
1040 return -ENOMEM;
1041 *pagep = page;
1042
1043 dout("write_begin file %p inode %p page %p %d~%d\n", file,
1044 inode, page, (int)pos, (int)len);
1045
1046 r = ceph_update_writeable_page(file, pos, len, page);
1047 } while (r == -EAGAIN);
1048
1049 return r;
1050}
1051
1052/*
1053 * we don't do anything in here that simple_write_end doesn't do
1054 * except adjust dirty page accounting and drop read lock on
1055 * mdsc->snap_rwsem.
1056 */
1057static int ceph_write_end(struct file *file, struct address_space *mapping,
1058 loff_t pos, unsigned len, unsigned copied,
1059 struct page *page, void *fsdata)
1060{
1061 struct inode *inode = file->f_dentry->d_inode;
1062 struct ceph_client *client = ceph_inode_to_client(inode);
1063 struct ceph_mds_client *mdsc = &client->mdsc;
1064 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1065 int check_cap = 0;
1066
1067 dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
1068 inode, page, (int)pos, (int)copied, (int)len);
1069
1070 /* zero the stale part of the page if we did a short copy */
1071 if (copied < len)
1072 zero_user_segment(page, from+copied, len);
1073
1074 /* did file size increase? */
 1075 /* (no need for i_size_read(); the caller holds i_mutex) */
1076 if (pos+copied > inode->i_size)
1077 check_cap = ceph_inode_set_size(inode, pos+copied);
1078
1079 if (!PageUptodate(page))
1080 SetPageUptodate(page);
1081
1082 set_page_dirty(page);
1083
1084 unlock_page(page);
1085 up_read(&mdsc->snap_rwsem);
1086 page_cache_release(page);
1087
1088 if (check_cap)
1089 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
1090
1091 return copied;
1092}
1093
1094/*
1095 * we set .direct_IO to indicate direct io is supported, but since we
1096 * intercept O_DIRECT reads and writes early, this function should
1097 * never get called.
1098 */
1099static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
1100 const struct iovec *iov,
1101 loff_t pos, unsigned long nr_segs)
1102{
1103 WARN_ON(1);
1104 return -EINVAL;
1105}
1106
1107const struct address_space_operations ceph_aops = {
1108 .readpage = ceph_readpage,
1109 .readpages = ceph_readpages,
1110 .writepage = ceph_writepage,
1111 .writepages = ceph_writepages_start,
1112 .write_begin = ceph_write_begin,
1113 .write_end = ceph_write_end,
1114 .set_page_dirty = ceph_set_page_dirty,
1115 .invalidatepage = ceph_invalidatepage,
1116 .releasepage = ceph_releasepage,
1117 .direct_IO = ceph_direct_io,
1118};
1119
1120
1121/*
1122 * vm ops
1123 */
1124
1125/*
1126 * Reuse write_begin here for simplicity.
1127 */
1128static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1129{
1130 struct inode *inode = vma->vm_file->f_dentry->d_inode;
1131 struct page *page = vmf->page;
1132 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
1133 loff_t off = page->index << PAGE_CACHE_SHIFT;
1134 loff_t size, len;
1135 int ret;
1136
1137 size = i_size_read(inode);
1138 if (off + PAGE_CACHE_SIZE <= size)
1139 len = PAGE_CACHE_SIZE;
1140 else
1141 len = size & ~PAGE_CACHE_MASK;
1142
1143 dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode,
1144 off, len, page, page->index);
1145
1146 lock_page(page);
1147
1148 ret = VM_FAULT_NOPAGE;
1149 if ((off > size) ||
1150 (page->mapping != inode->i_mapping))
1151 goto out;
1152
1153 ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
1154 if (ret == 0) {
1155 /* success. we'll keep the page locked. */
1156 set_page_dirty(page);
1157 up_read(&mdsc->snap_rwsem);
1158 ret = VM_FAULT_LOCKED;
1159 } else {
1160 if (ret == -ENOMEM)
1161 ret = VM_FAULT_OOM;
1162 else
1163 ret = VM_FAULT_SIGBUS;
1164 }
1165out:
1166 dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret);
1167 if (ret != VM_FAULT_LOCKED)
1168 unlock_page(page);
1169 return ret;
1170}
1171
1172static struct vm_operations_struct ceph_vmops = {
1173 .fault = filemap_fault,
1174 .page_mkwrite = ceph_page_mkwrite,
1175};
1176
1177int ceph_mmap(struct file *file, struct vm_area_struct *vma)
1178{
1179 struct address_space *mapping = file->f_mapping;
1180
1181 if (!mapping->a_ops->readpage)
1182 return -ENOEXEC;
1183 file_accessed(file);
1184 vma->vm_ops = &ceph_vmops;
1185 vma->vm_flags |= VM_CAN_NONLINEAR;
1186 return 0;
1187}
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
new file mode 100644
index 000000000000..67b2c030924b
--- /dev/null
+++ b/fs/ceph/armor.c
@@ -0,0 +1,99 @@
1
2#include <linux/errno.h>
3
4/*
5 * base64 encode/decode.
6 */
7
 8static const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
9
10static int encode_bits(int c)
11{
12 return pem_key[c];
13}
14
15static int decode_bits(char c)
16{
17 if (c >= 'A' && c <= 'Z')
18 return c - 'A';
19 if (c >= 'a' && c <= 'z')
20 return c - 'a' + 26;
21 if (c >= '0' && c <= '9')
22 return c - '0' + 52;
23 if (c == '+')
24 return 62;
25 if (c == '/')
26 return 63;
27 if (c == '=')
28 return 0; /* just non-negative, please */
29 return -EINVAL;
30}
31
32int ceph_armor(char *dst, const char *src, const char *end)
33{
34 int olen = 0;
35 int line = 0;
36
37 while (src < end) {
38 unsigned char a, b, c;
39
40 a = *src++;
41 *dst++ = encode_bits(a >> 2);
42 if (src < end) {
43 b = *src++;
44 *dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
45 if (src < end) {
46 c = *src++;
47 *dst++ = encode_bits(((b & 15) << 2) |
48 (c >> 6));
49 *dst++ = encode_bits(c & 63);
50 } else {
51 *dst++ = encode_bits((b & 15) << 2);
52 *dst++ = '=';
53 }
54 } else {
55 *dst++ = encode_bits(((a & 3) << 4));
56 *dst++ = '=';
57 *dst++ = '=';
58 }
59 olen += 4;
60 line += 4;
61 if (line == 64) {
62 line = 0;
63 *(dst++) = '\n';
64 olen++;
65 }
66 }
67 return olen;
68}
69
70int ceph_unarmor(char *dst, const char *src, const char *end)
71{
72 int olen = 0;
73
74 while (src < end) {
75 int a, b, c, d;
76
77 if (src < end && src[0] == '\n')
78 src++;
79 if (src + 4 > end)
80 return -EINVAL;
81 a = decode_bits(src[0]);
82 b = decode_bits(src[1]);
83 c = decode_bits(src[2]);
84 d = decode_bits(src[3]);
85 if (a < 0 || b < 0 || c < 0 || d < 0)
86 return -EINVAL;
87
88 *dst++ = (a << 2) | (b >> 4);
89 if (src[2] == '=')
90 return olen + 1;
91 *dst++ = ((b & 15) << 4) | (c >> 2);
92 if (src[3] == '=')
93 return olen + 2;
94 *dst++ = ((c & 3) << 6) | d;
95 olen += 3;
96 src += 4;
97 }
98 return olen;
99}
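[Editorial note] A hedged buffer-sizing sketch for ceph_armor() callers (the helper below is illustrative, not from the tree): every 3 input bytes become 4 output characters, and the encoder emits one '\n' per 64 output characters, so a destination buffer can be sized as:

#include <linux/kernel.h>	/* DIV_ROUND_UP */

static int ceph_armor_dst_len(int src_len)
{
	int out = 4 * DIV_ROUND_UP(src_len, 3);	/* base64 expansion */

	return out + out / 64;			/* room for the newlines */
}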
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
new file mode 100644
index 000000000000..818afe72e6c7
--- /dev/null
+++ b/fs/ceph/auth.c
@@ -0,0 +1,259 @@
1#include "ceph_debug.h"
2
3#include <linux/module.h>
4#include <linux/slab.h>
5#include <linux/err.h>
7
8#include "types.h"
9#include "auth_none.h"
10#include "auth_x.h"
11#include "decode.h"
12#include "super.h"
13
14#include "messenger.h"
15
16/*
17 * get protocol handler
18 */
19static u32 supported_protocols[] = {
20 CEPH_AUTH_NONE,
21 CEPH_AUTH_CEPHX
22};
23
24int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
25{
26 switch (protocol) {
27 case CEPH_AUTH_NONE:
28 return ceph_auth_none_init(ac);
29 case CEPH_AUTH_CEPHX:
30 return ceph_x_init(ac);
31 default:
32 return -ENOENT;
33 }
34}
35
36/*
37 * setup, teardown.
38 */
39struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
40{
41 struct ceph_auth_client *ac;
42 int ret;
43
44 dout("auth_init name '%s' secret '%s'\n", name, secret);
45
46 ret = -ENOMEM;
47 ac = kzalloc(sizeof(*ac), GFP_NOFS);
48 if (!ac)
49 goto out;
50
51 ac->negotiating = true;
52 if (name)
53 ac->name = name;
54 else
55 ac->name = CEPH_AUTH_NAME_DEFAULT;
56 dout("auth_init name %s secret %s\n", ac->name, secret);
57 ac->secret = secret;
58 return ac;
59
60out:
61 return ERR_PTR(ret);
62}
63
64void ceph_auth_destroy(struct ceph_auth_client *ac)
65{
66 dout("auth_destroy %p\n", ac);
67 if (ac->ops)
68 ac->ops->destroy(ac);
69 kfree(ac);
70}
71
72/*
73 * Reset occurs when reconnecting to the monitor.
74 */
75void ceph_auth_reset(struct ceph_auth_client *ac)
76{
77 dout("auth_reset %p\n", ac);
78 if (ac->ops && !ac->negotiating)
79 ac->ops->reset(ac);
80 ac->negotiating = true;
81}
82
83int ceph_entity_name_encode(const char *name, void **p, void *end)
84{
85 int len = strlen(name);
86
87 if (*p + 2*sizeof(u32) + len > end)
88 return -ERANGE;
89 ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
90 ceph_encode_32(p, len);
91 ceph_encode_copy(p, name, len);
92 return 0;
93}
94
95/*
96 * Initiate protocol negotiation with monitor. Include entity name
97 * and list supported protocols.
98 */
99int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
100{
101 struct ceph_mon_request_header *monhdr = buf;
102 void *p = monhdr + 1, *end = buf + len, *lenp;
103 int i, num;
104 int ret;
105
106 dout("auth_build_hello\n");
107 monhdr->have_version = 0;
108 monhdr->session_mon = cpu_to_le16(-1);
109 monhdr->session_mon_tid = 0;
110
111 ceph_encode_32(&p, 0); /* no protocol, yet */
112
113 lenp = p;
114 p += sizeof(u32);
115
116 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
117 ceph_encode_8(&p, 1);
118 num = ARRAY_SIZE(supported_protocols);
119 ceph_encode_32(&p, num);
120 ceph_decode_need(&p, end, num * sizeof(u32), bad);
121 for (i = 0; i < num; i++)
122 ceph_encode_32(&p, supported_protocols[i]);
123
124 ret = ceph_entity_name_encode(ac->name, &p, end);
125 if (ret < 0)
126 return ret;
127 ceph_decode_need(&p, end, sizeof(u64), bad);
128 ceph_encode_64(&p, ac->global_id);
129
130 ceph_encode_32(&lenp, p - lenp - sizeof(u32));
131 return p - buf;
132
133bad:
134 return -ERANGE;
135}
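/*
 * Editorial sketch of the hello message ceph_auth_build_hello()
 * produces (layout read off the code above; field names informal):
 *
 *	struct ceph_mon_request_header  monhdr
 *	__le32  protocol      = 0        (none chosen yet)
 *	__le32  payload_len              (patched in through 'lenp')
 *	u8      struct_v      = 1
 *	__le32  num_protocols
 *	__le32  protocols[num_protocols]
 *	__le32  entity_type   = CEPH_ENTITY_TYPE_CLIENT
 *	__le32  name_len, followed by name_len name bytes
 *	__le64  global_id
 */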
136
137int ceph_build_auth_request(struct ceph_auth_client *ac,
138 void *msg_buf, size_t msg_len)
139{
140 struct ceph_mon_request_header *monhdr = msg_buf;
141 void *p = monhdr + 1;
142 void *end = msg_buf + msg_len;
143 int ret;
144
145 monhdr->have_version = 0;
146 monhdr->session_mon = cpu_to_le16(-1);
147 monhdr->session_mon_tid = 0;
148
149 ceph_encode_32(&p, ac->protocol);
150
151 ret = ac->ops->build_request(ac, p + sizeof(u32), end);
152 if (ret < 0) {
153 pr_err("error %d building request\n", ret);
154 return ret;
155 }
156 dout(" built request %d bytes\n", ret);
157 ceph_encode_32(&p, ret);
158 return p + ret - msg_buf;
159}
160
161/*
162 * Handle auth message from monitor.
163 */
164int ceph_handle_auth_reply(struct ceph_auth_client *ac,
165 void *buf, size_t len,
166 void *reply_buf, size_t reply_len)
167{
168 void *p = buf;
169 void *end = buf + len;
170 int protocol;
171 s32 result;
172 u64 global_id;
173 void *payload, *payload_end;
174 int payload_len;
175 char *result_msg;
176 int result_msg_len;
177 int ret = -EINVAL;
178
179 dout("handle_auth_reply %p %p\n", p, end);
180 ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
181 protocol = ceph_decode_32(&p);
182 result = ceph_decode_32(&p);
183 global_id = ceph_decode_64(&p);
184 payload_len = ceph_decode_32(&p);
185 payload = p;
186 p += payload_len;
187 ceph_decode_need(&p, end, sizeof(u32), bad);
188 result_msg_len = ceph_decode_32(&p);
189 result_msg = p;
190 p += result_msg_len;
191 if (p != end)
192 goto bad;
193
194 dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
195 result_msg, global_id, payload_len);
196
197 payload_end = payload + payload_len;
198
199 if (global_id && ac->global_id != global_id) {
200 dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
201 ac->global_id = global_id;
202 }
203
204 if (ac->negotiating) {
205 /* server does not support our protocols? */
206 if (!protocol && result < 0) {
207 ret = result;
208 goto out;
209 }
210 /* set up (new) protocol handler? */
211 if (ac->protocol && ac->protocol != protocol) {
212 ac->ops->destroy(ac);
213 ac->protocol = 0;
214 ac->ops = NULL;
215 }
216 if (ac->protocol != protocol) {
217 ret = ceph_auth_init_protocol(ac, protocol);
218 if (ret) {
219 pr_err("error %d on auth protocol %d init\n",
220 ret, protocol);
221 goto out;
222 }
223 }
224
225 ac->negotiating = false;
226 }
227
228 ret = ac->ops->handle_reply(ac, result, payload, payload_end);
229 if (ret == -EAGAIN) {
230 return ceph_build_auth_request(ac, reply_buf, reply_len);
231 } else if (ret) {
232 pr_err("authentication error %d\n", ret);
233 return ret;
234 }
235 return 0;
236
237bad:
238 pr_err("failed to decode auth msg\n");
239out:
240 return ret;
241}
242
243int ceph_build_auth(struct ceph_auth_client *ac,
244 void *msg_buf, size_t msg_len)
245{
246 if (!ac->protocol)
247 return ceph_auth_build_hello(ac, msg_buf, msg_len);
248 BUG_ON(!ac->ops);
249 if (!ac->ops->is_authenticated(ac))
250 return ceph_build_auth_request(ac, msg_buf, msg_len);
251 return 0;
252}
253
254int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
255{
256 if (!ac->ops)
257 return 0;
258 return ac->ops->is_authenticated(ac);
259}
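
Reading aid, not part of the patch: the functions above are meant to be driven
in a simple request/reply loop by the monitor client (which lands elsewhere in
this series). A minimal sketch under that assumption; send_to_mon() and
recv_from_mon() are hypothetical stand-ins for the messenger:

	/* hypothetical driver for the handshake implemented above */
	static int auth_handshake(struct ceph_auth_client *ac,
				  void *req, size_t req_len,
				  void *reply, size_t reply_len)
	{
		int len, rlen;

		/* kick off with the hello (ac->protocol is still 0 here) */
		len = ceph_build_auth(ac, req, req_len);
		if (len < 0)
			return len;

		while (len > 0) {
			send_to_mon(req, len);			/* hypothetical */
			rlen = recv_from_mon(reply, reply_len);	/* hypothetical */
			/* a positive return means the next request is in req */
			len = ceph_handle_auth_reply(ac, reply, rlen,
						     req, req_len);
			if (len < 0)
				return len;
		}
		return ceph_auth_is_authenticated(ac) ? 0 : -EAGAIN;
	}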
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
new file mode 100644
index 000000000000..ca4f57cfb267
--- /dev/null
+++ b/fs/ceph/auth.h
@@ -0,0 +1,84 @@
1#ifndef _FS_CEPH_AUTH_H
2#define _FS_CEPH_AUTH_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * Abstract interface for communicating with the authentication module.
9 * There is some handshake that takes place between us and the monitor
10 * to acquire the necessary keys. These are used to generate an
11 * 'authorizer' that we use when connecting to a service (mds, osd).
12 */
13
14struct ceph_auth_client;
15struct ceph_authorizer;
16
17struct ceph_auth_client_ops {
18 /*
19 * true if we are authenticated and can connect to
20 * services.
21 */
22 int (*is_authenticated)(struct ceph_auth_client *ac);
23
24 /*
25 * build requests and process replies during monitor
26 * handshake. if handle_reply returns -EAGAIN, we build
27 * another request.
28 */
29 int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
30 int (*handle_reply)(struct ceph_auth_client *ac, int result,
31 void *buf, void *end);
32
33 /*
34 * Create authorizer for connecting to a service, and verify
35 * the response to authenticate the service.
36 */
37 int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
38 struct ceph_authorizer **a,
39 void **buf, size_t *len,
40 void **reply_buf, size_t *reply_len);
41 int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
42 struct ceph_authorizer *a, size_t len);
43 void (*destroy_authorizer)(struct ceph_auth_client *ac,
44 struct ceph_authorizer *a);
45 void (*invalidate_authorizer)(struct ceph_auth_client *ac,
46 int peer_type);
47
48 /* reset when we (re)connect to a monitor */
49 void (*reset)(struct ceph_auth_client *ac);
50
51 void (*destroy)(struct ceph_auth_client *ac);
52};
53
54struct ceph_auth_client {
55 u32 protocol; /* CEPH_AUTH_* */
56 void *private; /* for use by protocol implementation */
57 const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */
58
59 bool negotiating; /* true if negotiating protocol */
60 const char *name; /* entity name */
61 u64 global_id; /* our unique id in system */
62 const char *secret; /* our secret key */
63 unsigned want_keys; /* which services we want */
64};
65
66extern struct ceph_auth_client *ceph_auth_init(const char *name,
67 const char *secret);
68extern void ceph_auth_destroy(struct ceph_auth_client *ac);
69
70extern void ceph_auth_reset(struct ceph_auth_client *ac);
71
72extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
73 void *buf, size_t len);
74extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
75 void *buf, size_t len,
76 void *reply_buf, size_t reply_len);
77extern int ceph_entity_name_encode(const char *name, void **p, void *end);
78
79extern int ceph_build_auth(struct ceph_auth_client *ac,
80 void *msg_buf, size_t msg_len);
81
82extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
83
84#endif
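
Reading aid, not part of the patch: a sketch of how a service connection might
consume the authorizer half of this interface. In the full series the
messenger is the real caller when connecting to an MDS or OSD; this function
is illustrative only:

	static int connect_authorize(struct ceph_auth_client *ac, int peer_type)
	{
		struct ceph_authorizer *a;
		void *buf, *reply;
		size_t len, reply_len;
		int ret;

		ret = ac->ops->create_authorizer(ac, peer_type, &a,
						 &buf, &len, &reply, &reply_len);
		if (ret)
			return ret;

		/* ... send buf/len with the connect handshake; the service's
		 * response lands in reply/reply_len ... */

		if (ac->ops->verify_authorizer_reply)
			ret = ac->ops->verify_authorizer_reply(ac, a, reply_len);
		ac->ops->destroy_authorizer(ac, a);
		return ret;	/* 0 means the service checked out */
	}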
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
new file mode 100644
index 000000000000..8cd9e3af07f7
--- /dev/null
+++ b/fs/ceph/auth_none.c
@@ -0,0 +1,122 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include "auth_none.h"
10#include "auth.h"
11#include "decode.h"
12
13static void reset(struct ceph_auth_client *ac)
14{
15 struct ceph_auth_none_info *xi = ac->private;
16
17 xi->starting = true;
18 xi->built_authorizer = false;
19}
20
21static void destroy(struct ceph_auth_client *ac)
22{
23 kfree(ac->private);
24 ac->private = NULL;
25}
26
27static int is_authenticated(struct ceph_auth_client *ac)
28{
29 struct ceph_auth_none_info *xi = ac->private;
30
31 return !xi->starting;
32}
33
34/*
35 * the generic auth code decodes the global_id, and we carry no actual
36 * authentication state, so nothing happens here.
37 */
38static int handle_reply(struct ceph_auth_client *ac, int result,
39 void *buf, void *end)
40{
41 struct ceph_auth_none_info *xi = ac->private;
42
43 xi->starting = false;
44 return result;
45}
46
47/*
48 * build an 'authorizer' with our entity_name and global_id. we can
49 * reuse a single static copy since it is identical for all services
50 * we connect to.
51 */
52static int ceph_auth_none_create_authorizer(
53 struct ceph_auth_client *ac, int peer_type,
54 struct ceph_authorizer **a,
55 void **buf, size_t *len,
56 void **reply_buf, size_t *reply_len)
57{
58 struct ceph_auth_none_info *ai = ac->private;
59 struct ceph_none_authorizer *au = &ai->au;
60 void *p, *end;
61 int ret;
62
63 if (!ai->built_authorizer) {
64 p = au->buf;
65 end = p + sizeof(au->buf);
66 ceph_encode_8(&p, 1);
67 ret = ceph_entity_name_encode(ac->name, &p, end - 8);
68 if (ret < 0)
69 goto bad;
70 ceph_decode_need(&p, end, sizeof(u64), bad2);
71 ceph_encode_64(&p, ac->global_id);
72 au->buf_len = p - (void *)au->buf;
73 ai->built_authorizer = true;
74 dout("built authorizer len %d\n", au->buf_len);
75 }
76
77 *a = (struct ceph_authorizer *)au;
78 *buf = au->buf;
79 *len = au->buf_len;
80 *reply_buf = au->reply_buf;
81 *reply_len = sizeof(au->reply_buf);
82 return 0;
83
84bad2:
85 ret = -ERANGE;
86bad:
87 return ret;
88}
89
90static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
91 struct ceph_authorizer *a)
92{
93 /* nothing to do */
94}
95
96static const struct ceph_auth_client_ops ceph_auth_none_ops = {
97 .reset = reset,
98 .destroy = destroy,
99 .is_authenticated = is_authenticated,
100 .handle_reply = handle_reply,
101 .create_authorizer = ceph_auth_none_create_authorizer,
102 .destroy_authorizer = ceph_auth_none_destroy_authorizer,
103};
104
105int ceph_auth_none_init(struct ceph_auth_client *ac)
106{
107 struct ceph_auth_none_info *xi;
108
109 dout("ceph_auth_none_init %p\n", ac);
110 xi = kzalloc(sizeof(*xi), GFP_NOFS);
111 if (!xi)
112 return -ENOMEM;
113
114 xi->starting = true;
115 xi->built_authorizer = false;
116
117 ac->protocol = CEPH_AUTH_NONE;
118 ac->private = xi;
119 ac->ops = &ceph_auth_none_ops;
120 return 0;
121}
122
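
For reference (not in the patch), the buffer built by
ceph_auth_none_create_authorizer() has a fixed wire layout, following
ceph_entity_name_encode() plus the trailing global id:

	u8   struct_v (1)
	u32  CEPH_ENTITY_TYPE_CLIENT
	u32  name_len
	char name[name_len]
	u64  global_id

so the 128-byte buf in struct ceph_none_authorizer comfortably covers typical
entity names, which is why a single prebuilt copy can be reused for every
service connection.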
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
new file mode 100644
index 000000000000..8164df1a08be
--- /dev/null
+++ b/fs/ceph/auth_none.h
@@ -0,0 +1,30 @@
1#ifndef _FS_CEPH_AUTH_NONE_H
2#define _FS_CEPH_AUTH_NONE_H
3
4#include <linux/slab.h>
5
6#include "auth.h"
7
8/*
9 * null security mode.
10 *
11 * we use a single static authorizer that simply encodes our entity name
12 * and global id.
13 */
14
15struct ceph_none_authorizer {
16 char buf[128];
17 int buf_len;
18 char reply_buf[0];
19};
20
21struct ceph_auth_none_info {
22 bool starting;
23 bool built_authorizer;
24 struct ceph_none_authorizer au; /* we only need one; it's static */
25};
26
27extern int ceph_auth_none_init(struct ceph_auth_client *ac);
28
29#endif
30
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
new file mode 100644
index 000000000000..fee5a08da881
--- /dev/null
+++ b/fs/ceph/auth_x.c
@@ -0,0 +1,668 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include "auth_x.h"
10#include "auth_x_protocol.h"
11#include "crypto.h"
12#include "auth.h"
13#include "decode.h"
14
15#define TEMP_TICKET_BUF_LEN 256
16
17static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
18
19static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
20{
21 struct ceph_x_info *xi = ac->private;
22 int need;
23
24 ceph_x_validate_tickets(ac, &need);
25 dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
26 ac->want_keys, need, xi->have_keys);
27 return (ac->want_keys & xi->have_keys) == ac->want_keys;
28}
29
30static int ceph_x_encrypt_buflen(int ilen)
31{
32 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
33 sizeof(u32);
34}
35
36static int ceph_x_encrypt(struct ceph_crypto_key *secret,
37 void *ibuf, int ilen, void *obuf, size_t olen)
38{
39 struct ceph_x_encrypt_header head = {
40 .struct_v = 1,
41 .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
42 };
43 size_t len = olen - sizeof(u32);
44 int ret;
45
46 ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
47 &head, sizeof(head), ibuf, ilen);
48 if (ret)
49 return ret;
50 ceph_encode_32(&obuf, len);
51 return len + sizeof(u32);
52}
53
54static int ceph_x_decrypt(struct ceph_crypto_key *secret,
55 void **p, void *end, void *obuf, size_t olen)
56{
57 struct ceph_x_encrypt_header head;
58 size_t head_len = sizeof(head);
59 int len, ret;
60
61 len = ceph_decode_32(p);
62 if (*p + len > end)
63 return -EINVAL;
64
65 dout("ceph_x_decrypt len %d\n", len);
66 ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
67 *p, len);
68 if (ret)
69 return ret;
70 if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
71 return -EPERM;
72 *p += len;
73 return olen;
74}
75
76/*
77 * get existing (or insert new) ticket handler
78 */
79struct ceph_x_ticket_handler *get_ticket_handler(struct ceph_auth_client *ac,
80 int service)
81{
82 struct ceph_x_ticket_handler *th;
83 struct ceph_x_info *xi = ac->private;
84 struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
85
86 while (*p) {
87 parent = *p;
88 th = rb_entry(parent, struct ceph_x_ticket_handler, node);
89 if (service < th->service)
90 p = &(*p)->rb_left;
91 else if (service > th->service)
92 p = &(*p)->rb_right;
93 else
94 return th;
95 }
96
97 /* add it */
98 th = kzalloc(sizeof(*th), GFP_NOFS);
99 if (!th)
100 return ERR_PTR(-ENOMEM);
101 th->service = service;
102 rb_link_node(&th->node, parent, p);
103 rb_insert_color(&th->node, &xi->ticket_handlers);
104 return th;
105}
106
107static void remove_ticket_handler(struct ceph_auth_client *ac,
108 struct ceph_x_ticket_handler *th)
109{
110 struct ceph_x_info *xi = ac->private;
111
112 dout("remove_ticket_handler %p %d\n", th, th->service);
113 rb_erase(&th->node, &xi->ticket_handlers);
114 ceph_crypto_key_destroy(&th->session_key);
115 if (th->ticket_blob)
116 ceph_buffer_put(th->ticket_blob);
117 kfree(th);
118}
119
120static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
121 struct ceph_crypto_key *secret,
122 void *buf, void *end)
123{
124 struct ceph_x_info *xi = ac->private;
125 int num;
126 void *p = buf;
127 int ret;
128 char *dbuf;
129 char *ticket_buf;
130 u8 struct_v;
131
132 dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
133 if (!dbuf)
134 return -ENOMEM;
135
136 ret = -ENOMEM;
137 ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
138 if (!ticket_buf)
139 goto out_dbuf;
140
141 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
142 struct_v = ceph_decode_8(&p);
143 if (struct_v != 1)
144 goto bad;
145 num = ceph_decode_32(&p);
146 dout("%d tickets\n", num);
147 while (num--) {
148 int type;
149 u8 struct_v;
150 struct ceph_x_ticket_handler *th;
151 void *dp, *dend;
152 int dlen;
153 char is_enc;
154 struct timespec validity;
155 struct ceph_crypto_key old_key;
156 void *tp, *tpend;
157 struct ceph_timespec new_validity;
158 struct ceph_crypto_key new_session_key;
159 struct ceph_buffer *new_ticket_blob;
160 unsigned long new_expires, new_renew_after;
161 u64 new_secret_id;
162
163 ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
164
165 type = ceph_decode_32(&p);
166 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
167
168 struct_v = ceph_decode_8(&p);
169 if (struct_v != 1)
170 goto bad;
171
172 th = get_ticket_handler(ac, type);
173 if (IS_ERR(th)) {
174 ret = PTR_ERR(th);
175 goto out;
176 }
177
178 /* blob for me */
179 dlen = ceph_x_decrypt(secret, &p, end, dbuf,
180 TEMP_TICKET_BUF_LEN);
181 if (dlen <= 0) {
182 ret = dlen;
183 goto out;
184 }
185 dout(" decrypted %d bytes\n", dlen);
186 dend = dbuf + dlen;
187 dp = dbuf;
188
189 struct_v = ceph_decode_8(&dp);
190 if (struct_v != 1)
191 goto bad;
192
193 memcpy(&old_key, &th->session_key, sizeof(old_key));
194 ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
195 if (ret)
196 goto out;
197
198 ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
199 ceph_decode_timespec(&validity, &new_validity);
200 new_expires = get_seconds() + validity.tv_sec;
201 new_renew_after = new_expires - (validity.tv_sec / 4);
202 dout(" expires=%lu renew_after=%lu\n", new_expires,
203 new_renew_after);
204
205 /* ticket blob for service */
206 ceph_decode_8_safe(&p, end, is_enc, bad);
207 tp = ticket_buf;
208 if (is_enc) {
209 /* encrypted */
210 dout(" encrypted ticket\n");
211 dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
212 TEMP_TICKET_BUF_LEN);
213 if (dlen < 0) {
214 ret = dlen;
215 goto out;
216 }
217 dlen = ceph_decode_32(&tp);
218 } else {
219 /* unencrypted */
220 ceph_decode_32_safe(&p, end, dlen, bad);
221 ceph_decode_need(&p, end, dlen, bad);
222 ceph_decode_copy(&p, ticket_buf, dlen);
223 }
224 tpend = tp + dlen;
225 dout(" ticket blob is %d bytes\n", dlen);
226 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
227 struct_v = ceph_decode_8(&tp);
228 new_secret_id = ceph_decode_64(&tp);
229 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
230 if (ret)
231 goto out;
232
233 /* all is well, update our ticket */
234 ceph_crypto_key_destroy(&th->session_key);
235 if (th->ticket_blob)
236 ceph_buffer_put(th->ticket_blob);
237 th->session_key = new_session_key;
238 th->ticket_blob = new_ticket_blob;
239 th->validity = new_validity;
240 th->secret_id = new_secret_id;
241 th->expires = new_expires;
242 th->renew_after = new_renew_after;
243 dout(" got ticket service %d (%s) secret_id %lld len %d\n",
244 type, ceph_entity_type_name(type), th->secret_id,
245 (int)th->ticket_blob->vec.iov_len);
246 xi->have_keys |= th->service;
247 }
248
249 ret = 0;
250out:
251 kfree(ticket_buf);
252out_dbuf:
253 kfree(dbuf);
254 return ret;
255
256bad:
257 ret = -EINVAL;
258 goto out;
259}
260
261static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
262 struct ceph_x_ticket_handler *th,
263 struct ceph_x_authorizer *au)
264{
265 int maxlen;
266 struct ceph_x_authorize_a *msg_a;
267 struct ceph_x_authorize_b msg_b;
268 void *p, *end;
269 int ret;
270 int ticket_blob_len =
271 (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
272
273 dout("build_authorizer for %s %p\n",
274 ceph_entity_type_name(th->service), au);
275
276 maxlen = sizeof(*msg_a) + sizeof(msg_b) +
277 ceph_x_encrypt_buflen(ticket_blob_len);
278 dout(" need len %d\n", maxlen);
279 if (au->buf && au->buf->alloc_len < maxlen) {
280 ceph_buffer_put(au->buf);
281 au->buf = NULL;
282 }
283 if (!au->buf) {
284 au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
285 if (!au->buf)
286 return -ENOMEM;
287 }
288 au->service = th->service;
289
290 msg_a = au->buf->vec.iov_base;
291 msg_a->struct_v = 1;
292 msg_a->global_id = cpu_to_le64(ac->global_id);
293 msg_a->service_id = cpu_to_le32(th->service);
294 msg_a->ticket_blob.struct_v = 1;
295 msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
296 msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
297 if (ticket_blob_len) {
298 memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
299 th->ticket_blob->vec.iov_len);
300 }
301 dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
302 le64_to_cpu(msg_a->ticket_blob.secret_id));
303
304 p = msg_a + 1;
305 p += ticket_blob_len;
306 end = au->buf->vec.iov_base + au->buf->vec.iov_len;
307
308 get_random_bytes(&au->nonce, sizeof(au->nonce));
309 msg_b.struct_v = 1;
310 msg_b.nonce = cpu_to_le64(au->nonce);
311 ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
312 p, end - p);
313 if (ret < 0)
314 goto out_buf;
315 p += ret;
316 au->buf->vec.iov_len = p - au->buf->vec.iov_base;
317 dout(" built authorizer nonce %llx len %d\n", au->nonce,
318 (int)au->buf->vec.iov_len);
319 BUG_ON(au->buf->vec.iov_len > maxlen);
320 return 0;
321
322out_buf:
323 ceph_buffer_put(au->buf);
324 au->buf = NULL;
325 return ret;
326}
327
328static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
329 void **p, void *end)
330{
331 ceph_decode_need(p, end, 1 + sizeof(u64), bad);
332 ceph_encode_8(p, 1);
333 ceph_encode_64(p, th->secret_id);
334 if (th->ticket_blob) {
335 const char *buf = th->ticket_blob->vec.iov_base;
336 u32 len = th->ticket_blob->vec.iov_len;
337
338 ceph_encode_32_safe(p, end, len, bad);
339 ceph_encode_copy_safe(p, end, buf, len, bad);
340 } else {
341 ceph_encode_32_safe(p, end, 0, bad);
342 }
343
344 return 0;
345bad:
346 return -ERANGE;
347}
348
349static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
350{
351 int want = ac->want_keys;
352 struct ceph_x_info *xi = ac->private;
353 int service;
354
355 *pneed = ac->want_keys & ~(xi->have_keys);
356
357 for (service = 1; service <= want; service <<= 1) {
358 struct ceph_x_ticket_handler *th;
359
360 if (!(ac->want_keys & service))
361 continue;
362
363 if (*pneed & service)
364 continue;
365
366 th = get_ticket_handler(ac, service);
367
368 if (!th) {
369 *pneed |= service;
370 continue;
371 }
372
373 if (get_seconds() >= th->renew_after)
374 *pneed |= service;
375 if (get_seconds() >= th->expires)
376 xi->have_keys &= ~service;
377 }
378}
379
380
381static int ceph_x_build_request(struct ceph_auth_client *ac,
382 void *buf, void *end)
383{
384 struct ceph_x_info *xi = ac->private;
385 int need;
386 struct ceph_x_request_header *head = buf;
387 int ret;
388 struct ceph_x_ticket_handler *th =
389 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
390
391 ceph_x_validate_tickets(ac, &need);
392
393 dout("build_request want %x have %x need %x\n",
394 ac->want_keys, xi->have_keys, need);
395
396 if (need & CEPH_ENTITY_TYPE_AUTH) {
397 struct ceph_x_authenticate *auth = (void *)(head + 1);
398 void *p = auth + 1;
399 struct ceph_x_challenge_blob tmp;
400 char tmp_enc[40];
401 u64 *u;
402
403 if (p > end)
404 return -ERANGE;
405
406 dout(" get_auth_session_key\n");
407 head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
408
409 /* encrypt and hash */
410 get_random_bytes(&auth->client_challenge, sizeof(u64));
411 tmp.client_challenge = auth->client_challenge;
412 tmp.server_challenge = cpu_to_le64(xi->server_challenge);
413 ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
414 tmp_enc, sizeof(tmp_enc));
415 if (ret < 0)
416 return ret;
417
418 auth->struct_v = 1;
419 auth->key = 0;
420 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
421 auth->key ^= *u;
422 dout(" server_challenge %llx client_challenge %llx key %llx\n",
423 xi->server_challenge, le64_to_cpu(auth->client_challenge),
424 le64_to_cpu(auth->key));
425
426 /* now encode the old ticket, if one exists */
427 ret = ceph_x_encode_ticket(th, &p, end);
428 if (ret < 0)
429 return ret;
430
431 return p - buf;
432 }
433
434 if (need) {
435 void *p = head + 1;
436 struct ceph_x_service_ticket_request *req;
437
438 if (p > end)
439 return -ERANGE;
440 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
441
442 BUG_ON(!th);
443 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
444 if (ret)
445 return ret;
446 ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
447 xi->auth_authorizer.buf->vec.iov_len);
448
449 req = p;
450 req->keys = cpu_to_le32(need);
451 p += sizeof(*req);
452 return p - buf;
453 }
454
455 return 0;
456}
457
458static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
459 void *buf, void *end)
460{
461 struct ceph_x_info *xi = ac->private;
462 struct ceph_x_reply_header *head = buf;
463 struct ceph_x_ticket_handler *th;
464 int len = end - buf;
465 int op;
466 int ret;
467
468 if (result)
469 return result; /* XXX hmm? */
470
471 if (xi->starting) {
472 /* it's a hello */
473 struct ceph_x_server_challenge *sc = buf;
474
475 if (len != sizeof(*sc))
476 return -EINVAL;
477 xi->server_challenge = le64_to_cpu(sc->server_challenge);
478 dout("handle_reply got server challenge %llx\n",
479 xi->server_challenge);
480 xi->starting = false;
481 xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
482 return -EAGAIN;
483 }
484
485 op = le32_to_cpu(head->op);
486 result = le32_to_cpu(head->result);
487 dout("handle_reply op %d result %d\n", op, result);
488 switch (op) {
489 case CEPHX_GET_AUTH_SESSION_KEY:
490 /* verify auth key */
491 ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
492 buf + sizeof(*head), end);
493 break;
494
495 case CEPHX_GET_PRINCIPAL_SESSION_KEY:
496 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
497 BUG_ON(!th);
498 ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
499 buf + sizeof(*head), end);
500 break;
501
502 default:
503 return -EINVAL;
504 }
505 if (ret)
506 return ret;
507 if (ac->want_keys == xi->have_keys)
508 return 0;
509 return -EAGAIN;
510}
511
512static int ceph_x_create_authorizer(
513 struct ceph_auth_client *ac, int peer_type,
514 struct ceph_authorizer **a,
515 void **buf, size_t *len,
516 void **reply_buf, size_t *reply_len)
517{
518 struct ceph_x_authorizer *au;
519 struct ceph_x_ticket_handler *th;
520 int ret;
521
522 th = get_ticket_handler(ac, peer_type);
523 if (IS_ERR(th))
524 return PTR_ERR(th);
525
526 au = kzalloc(sizeof(*au), GFP_NOFS);
527 if (!au)
528 return -ENOMEM;
529
530 ret = ceph_x_build_authorizer(ac, th, au);
531 if (ret) {
532 kfree(au);
533 return ret;
534 }
535
536 *a = (struct ceph_authorizer *)au;
537 *buf = au->buf->vec.iov_base;
538 *len = au->buf->vec.iov_len;
539 *reply_buf = au->reply_buf;
540 *reply_len = sizeof(au->reply_buf);
541 return 0;
542}
543
544static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
545 struct ceph_authorizer *a, size_t len)
546{
547 struct ceph_x_authorizer *au = (void *)a;
548 struct ceph_x_ticket_handler *th;
549 int ret = 0;
550 struct ceph_x_authorize_reply reply;
551 void *p = au->reply_buf;
552 void *end = p + sizeof(au->reply_buf);
553
554 th = get_ticket_handler(ac, au->service);
555 if (!th)
556 return -EIO; /* hrm! */
557 ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
558 if (ret < 0)
559 return ret;
560 if (ret != sizeof(reply))
561 return -EPERM;
562
563 if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
564 ret = -EPERM;
565 else
566 ret = 0;
567 dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
568 au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
569 return ret;
570}
571
572static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
573 struct ceph_authorizer *a)
574{
575 struct ceph_x_authorizer *au = (void *)a;
576
577 ceph_buffer_put(au->buf);
578 kfree(au);
579}
580
581
582static void ceph_x_reset(struct ceph_auth_client *ac)
583{
584 struct ceph_x_info *xi = ac->private;
585
586 dout("reset\n");
587 xi->starting = true;
588 xi->server_challenge = 0;
589}
590
591static void ceph_x_destroy(struct ceph_auth_client *ac)
592{
593 struct ceph_x_info *xi = ac->private;
594 struct rb_node *p;
595
596 dout("ceph_x_destroy %p\n", ac);
597 ceph_crypto_key_destroy(&xi->secret);
598
599 while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
600 struct ceph_x_ticket_handler *th =
601 rb_entry(p, struct ceph_x_ticket_handler, node);
602 remove_ticket_handler(ac, th);
603 }
604
605 kfree(ac->private);
606 ac->private = NULL;
607}
608
609static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
610 int peer_type)
611{
612 struct ceph_x_ticket_handler *th;
613
614 th = get_ticket_handler(ac, peer_type);
615 if (th && !IS_ERR(th))
616 remove_ticket_handler(ac, th);
617}
618
619
620static const struct ceph_auth_client_ops ceph_x_ops = {
621 .is_authenticated = ceph_x_is_authenticated,
622 .build_request = ceph_x_build_request,
623 .handle_reply = ceph_x_handle_reply,
624 .create_authorizer = ceph_x_create_authorizer,
625 .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
626 .destroy_authorizer = ceph_x_destroy_authorizer,
627 .invalidate_authorizer = ceph_x_invalidate_authorizer,
628 .reset = ceph_x_reset,
629 .destroy = ceph_x_destroy,
630};
631
632
633int ceph_x_init(struct ceph_auth_client *ac)
634{
635 struct ceph_x_info *xi;
636 int ret;
637
638 dout("ceph_x_init %p\n", ac);
639 ret = -ENOMEM;
640 xi = kzalloc(sizeof(*xi), GFP_NOFS);
641 if (!xi)
642 goto out;
643
644 ret = -EINVAL;
645 if (!ac->secret) {
646 pr_err("no secret set (for auth_x protocol)\n");
647 goto out_nomem;
648 }
649
650 ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
651 if (ret)
652 goto out_nomem;
653
654 xi->starting = true;
655 xi->ticket_handlers = RB_ROOT;
656
657 ac->protocol = CEPH_AUTH_CEPHX;
658 ac->private = xi;
659 ac->ops = &ceph_x_ops;
660 return 0;
661
662out_nomem:
663 kfree(xi);
664out:
665 return ret;
666}
667
668
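
One easily missed step above: in ceph_x_build_request(), the 64-bit 'key'
proving knowledge of the shared secret is the XOR-fold of the encrypted
{server_challenge, client_challenge} blob. The same fold as a standalone
sketch (reading aid only, mirroring the loop in ceph_x_build_request()):

	/* fold an encrypted challenge blob into the 64-bit cephx proof value */
	static u64 cephx_fold_key(const void *enc, int enc_len)
	{
		const u64 *u;
		u64 key = 0;

		for (u = enc; u + 1 <= (const u64 *)(enc + enc_len); u++)
			key ^= *u;
		return key;
	}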
diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h
new file mode 100644
index 000000000000..ff6f8180e681
--- /dev/null
+++ b/fs/ceph/auth_x.h
@@ -0,0 +1,49 @@
1#ifndef _FS_CEPH_AUTH_X_H
2#define _FS_CEPH_AUTH_X_H
3
4#include <linux/rbtree.h>
5
6#include "crypto.h"
7#include "auth.h"
8#include "auth_x_protocol.h"
9
10/*
11 * Handle ticket for a single service.
12 */
13struct ceph_x_ticket_handler {
14 struct rb_node node;
15 unsigned service;
16
17 struct ceph_crypto_key session_key;
18 struct ceph_timespec validity;
19
20 u64 secret_id;
21 struct ceph_buffer *ticket_blob;
22
23 unsigned long renew_after, expires;
24};
25
26
27struct ceph_x_authorizer {
28 struct ceph_buffer *buf;
29 unsigned service;
30 u64 nonce;
31 char reply_buf[128]; /* big enough for encrypted blob */
32};
33
34struct ceph_x_info {
35 struct ceph_crypto_key secret;
36
37 bool starting;
38 u64 server_challenge;
39
40 unsigned have_keys;
41 struct rb_root ticket_handlers;
42
43 struct ceph_x_authorizer auth_authorizer;
44};
45
46extern int ceph_x_init(struct ceph_auth_client *ac);
47
48#endif
49
diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h
new file mode 100644
index 000000000000..671d30576c4f
--- /dev/null
+++ b/fs/ceph/auth_x_protocol.h
@@ -0,0 +1,90 @@
1#ifndef __FS_CEPH_AUTH_X_PROTOCOL
2#define __FS_CEPH_AUTH_X_PROTOCOL
3
4#define CEPHX_GET_AUTH_SESSION_KEY 0x0100
5#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
6#define CEPHX_GET_ROTATING_KEY 0x0400
7
8/* common bits */
9struct ceph_x_ticket_blob {
10 __u8 struct_v;
11 __le64 secret_id;
12 __le32 blob_len;
13 char blob[];
14} __attribute__ ((packed));
15
16
17/* common request/reply headers */
18struct ceph_x_request_header {
19 __le16 op;
20} __attribute__ ((packed));
21
22struct ceph_x_reply_header {
23 __le16 op;
24 __le32 result;
25} __attribute__ ((packed));
26
27
28/* authenticate handshake */
29
30/* initial hello (no reply header) */
31struct ceph_x_server_challenge {
32 __u8 struct_v;
33 __le64 server_challenge;
34} __attribute__ ((packed));
35
36struct ceph_x_authenticate {
37 __u8 struct_v;
38 __le64 client_challenge;
39 __le64 key;
40 /* ticket blob */
41} __attribute__ ((packed));
42
43struct ceph_x_service_ticket_request {
44 __u8 struct_v;
45 __le32 keys;
46} __attribute__ ((packed));
47
48struct ceph_x_challenge_blob {
49 __le64 server_challenge;
50 __le64 client_challenge;
51} __attribute__ ((packed));
52
53
54
55/* authorize handshake */
56
57/*
58 * The authorizer consists of two pieces:
59 * a - service id, ticket blob
60 * b - encrypted with session key
61 */
62struct ceph_x_authorize_a {
63 __u8 struct_v;
64 __le64 global_id;
65 __le32 service_id;
66 struct ceph_x_ticket_blob ticket_blob;
67} __attribute__ ((packed));
68
69struct ceph_x_authorize_b {
70 __u8 struct_v;
71 __le64 nonce;
72} __attribute__ ((packed));
73
74struct ceph_x_authorize_reply {
75 __u8 struct_v;
76 __le64 nonce_plus_one;
77} __attribute__ ((packed));
78
79
80/*
81 * encryption bundle
82 */
83#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
84
85struct ceph_x_encrypt_header {
86 __u8 struct_v;
87 __le64 magic;
88} __attribute__ ((packed));
89
90#endif
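
These structs determine the buffer bound computed in ceph_x_build_authorizer()
over in auth_x.c; restated here as a reading aid (not part of the patch):

	/* authorizer buffer bound: part a (plus the appended ticket blob)
	 * travels in the clear, part b is encrypted with the session key */
	maxlen = sizeof(struct ceph_x_authorize_a)	/* fixed part a */
	       + sizeof(struct ceph_x_authorize_b)	/* nonce, pre-encryption */
	       + ceph_x_encrypt_buflen(ticket_blob_len);

	/* where ceph_x_encrypt_buflen(n) ==
	 *	sizeof(struct ceph_x_encrypt_header) + n + 16 + sizeof(u32)
	 * (cipher header, payload, worst-case block padding, length prefix) */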
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
new file mode 100644
index 000000000000..c67535d70aa6
--- /dev/null
+++ b/fs/ceph/buffer.c
@@ -0,0 +1,81 @@
1
2#include "ceph_debug.h"
3
4#include <linux/slab.h>
5
6#include "buffer.h"
7#include "decode.h"
8
9struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
10{
11 struct ceph_buffer *b;
12
13 b = kmalloc(sizeof(*b), gfp);
14 if (!b)
15 return NULL;
16
17 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
18 if (b->vec.iov_base) {
19 b->is_vmalloc = false;
20 } else {
21 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
22 if (!b->vec.iov_base) {
23 kfree(b);
24 return NULL;
25 }
26 b->is_vmalloc = true;
27 }
28
29 kref_init(&b->kref);
30 b->alloc_len = len;
31 b->vec.iov_len = len;
32 dout("buffer_new %p\n", b);
33 return b;
34}
35
36void ceph_buffer_release(struct kref *kref)
37{
38 struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
39
40 dout("buffer_release %p\n", b);
41 if (b->vec.iov_base) {
42 if (b->is_vmalloc)
43 vfree(b->vec.iov_base);
44 else
45 kfree(b->vec.iov_base);
46 }
47 kfree(b);
48}
49
50int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp)
51{
52 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
53 if (b->vec.iov_base) {
54 b->is_vmalloc = false;
55 } else {
56 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
57 b->is_vmalloc = true;
58 }
59 if (!b->vec.iov_base)
60 return -ENOMEM;
61 b->alloc_len = len;
62 b->vec.iov_len = len;
63 return 0;
64}
65
66int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
67{
68 size_t len;
69
70 ceph_decode_need(p, end, sizeof(u32), bad);
71 len = ceph_decode_32(p);
72 dout("decode_buffer len %d\n", (int)len);
73 ceph_decode_need(p, end, len, bad);
74 *b = ceph_buffer_new(len, GFP_NOFS);
75 if (!*b)
76 return -ENOMEM;
77 ceph_decode_copy(p, (*b)->vec.iov_base, len);
78 return 0;
79bad:
80 return -EINVAL;
81}
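
Reading aid: a typical lifecycle of these refcounted buffers, as a fragment
(the ticket-blob and xattr paths in this series are the real users; 'data'
and 'data_len' below are hypothetical):

	struct ceph_buffer *b;

	b = ceph_buffer_new(data_len, GFP_NOFS);	/* kref starts at 1 */
	if (!b)
		return -ENOMEM;
	memcpy(b->vec.iov_base, data, data_len);	/* 'data' is hypothetical */

	ceph_buffer_get(b);	/* hand a second reference to another owner */
	ceph_buffer_put(b);	/* first owner done */
	ceph_buffer_put(b);	/* last ref: ceph_buffer_release() frees it */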
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
new file mode 100644
index 000000000000..58d19014068f
--- /dev/null
+++ b/fs/ceph/buffer.h
@@ -0,0 +1,39 @@
1#ifndef __FS_CEPH_BUFFER_H
2#define __FS_CEPH_BUFFER_H
3
4#include <linux/kref.h>
5#include <linux/mm.h>
6#include <linux/vmalloc.h>
7#include <linux/types.h>
8#include <linux/uio.h>
9
10/*
11 * a simple reference counted buffer.
12 *
13 * use kmalloc for small sizes (<= one page), vmalloc for larger
14 * sizes.
15 */
16struct ceph_buffer {
17 struct kref kref;
18 struct kvec vec;
19 size_t alloc_len;
20 bool is_vmalloc;
21};
22
23extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
24extern void ceph_buffer_release(struct kref *kref);
25
26static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
27{
28 kref_get(&b->kref);
29 return b;
30}
31
32static inline void ceph_buffer_put(struct ceph_buffer *b)
33{
34 kref_put(&b->kref, ceph_buffer_release);
35}
36
37extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
38
39#endif
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
new file mode 100644
index 000000000000..d9400534b279
--- /dev/null
+++ b/fs/ceph/caps.c
@@ -0,0 +1,2960 @@
1#include "ceph_debug.h"
2
3#include <linux/fs.h>
4#include <linux/kernel.h>
5#include <linux/sched.h>
6#include <linux/slab.h>
7#include <linux/vmalloc.h>
8#include <linux/wait.h>
9#include <linux/writeback.h>
10
11#include "super.h"
12#include "decode.h"
13#include "messenger.h"
14
15/*
16 * Capability management
17 *
18 * The Ceph metadata servers control client access to inode metadata
19 * and file data by issuing capabilities, granting clients permission
20 * to read and/or write both inode fields and file data to OSDs
21 * (storage nodes). Each capability consists of a set of bits
22 * indicating which operations are allowed.
23 *
24 * If the client holds a *_SHARED cap, the client has a coherent value
25 * that can be safely read from the cached inode.
26 *
27 * In the case of *_EXCL (exclusive) or FILE_WR capabilities, the
28 * client is allowed to change inode attributes (e.g., file size,
29 * mtime), note its dirty state in the ceph_cap, and asynchronously
30 * flush that metadata change to the MDS.
31 *
32 * In the event of a conflicting operation (perhaps by another
33 * client), the MDS will revoke the conflicting client capabilities.
34 *
35 * In order for a client to cache an inode, it must hold a capability
36 * from at least one MDS server. When inodes are released, release
37 * notifications are batched and periodically sent en masse to the MDS
38 * cluster to release server state.
39 */
40
41
42/*
43 * Generate readable cap strings for debugging output.
44 */
45#define MAX_CAP_STR 20
46static char cap_str[MAX_CAP_STR][40];
47static DEFINE_SPINLOCK(cap_str_lock);
48static int last_cap_str;
49
50static char *gcap_string(char *s, int c)
51{
52 if (c & CEPH_CAP_GSHARED)
53 *s++ = 's';
54 if (c & CEPH_CAP_GEXCL)
55 *s++ = 'x';
56 if (c & CEPH_CAP_GCACHE)
57 *s++ = 'c';
58 if (c & CEPH_CAP_GRD)
59 *s++ = 'r';
60 if (c & CEPH_CAP_GWR)
61 *s++ = 'w';
62 if (c & CEPH_CAP_GBUFFER)
63 *s++ = 'b';
64 if (c & CEPH_CAP_GLAZYIO)
65 *s++ = 'l';
66 return s;
67}
68
69const char *ceph_cap_string(int caps)
70{
71 int i;
72 char *s;
73 int c;
74
75 spin_lock(&cap_str_lock);
76 i = last_cap_str++;
77 if (last_cap_str == MAX_CAP_STR)
78 last_cap_str = 0;
79 spin_unlock(&cap_str_lock);
80
81 s = cap_str[i];
82
83 if (caps & CEPH_CAP_PIN)
84 *s++ = 'p';
85
86 c = (caps >> CEPH_CAP_SAUTH) & 3;
87 if (c) {
88 *s++ = 'A';
89 s = gcap_string(s, c);
90 }
91
92 c = (caps >> CEPH_CAP_SLINK) & 3;
93 if (c) {
94 *s++ = 'L';
95 s = gcap_string(s, c);
96 }
97
98 c = (caps >> CEPH_CAP_SXATTR) & 3;
99 if (c) {
100 *s++ = 'X';
101 s = gcap_string(s, c);
102 }
103
104 c = caps >> CEPH_CAP_SFILE;
105 if (c) {
106 *s++ = 'F';
107 s = gcap_string(s, c);
108 }
109
110 if (s == cap_str[i])
111 *s++ = '-';
112 *s = 0;
113 return cap_str[i];
114}
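
/*
 * Example (reading aid, not in the original patch): for
 * caps == CEPH_CAP_PIN | CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD the
 * result is "pFcr": 'p' for the pin, 'F' opens the FILE section, then
 * gcap_string() emits 'c' (cache) and 'r' (read) in its fixed order.
 */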
115
116/*
117 * Cap reservations
118 *
119 * Maintain a global pool of preallocated struct ceph_caps, referenced
120 * by struct ceph_cap_reservation. This ensures that we preallocate
121 * memory needed to successfully process an MDS response. (If an MDS
122 * sends us cap information and we fail to process it, we will have
123 * problems due to the client and MDS being out of sync.)
124 *
125 * Reservations are 'owned' by a ceph_cap_reservation context.
126 */
127static spinlock_t caps_list_lock;
128static struct list_head caps_list; /* unused (reserved or unreserved) */
129static int caps_total_count; /* total caps allocated */
130static int caps_use_count; /* in use */
131static int caps_reserve_count; /* unused, reserved */
132static int caps_avail_count; /* unused, unreserved */
133static int caps_min_count; /* keep at least this many (unreserved) */
134
135void __init ceph_caps_init(void)
136{
137 INIT_LIST_HEAD(&caps_list);
138 spin_lock_init(&caps_list_lock);
139}
140
141void ceph_caps_finalize(void)
142{
143 struct ceph_cap *cap;
144
145 spin_lock(&caps_list_lock);
146 while (!list_empty(&caps_list)) {
147 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
148 list_del(&cap->caps_item);
149 kmem_cache_free(ceph_cap_cachep, cap);
150 }
151 caps_total_count = 0;
152 caps_avail_count = 0;
153 caps_use_count = 0;
154 caps_reserve_count = 0;
155 caps_min_count = 0;
156 spin_unlock(&caps_list_lock);
157}
158
159void ceph_adjust_min_caps(int delta)
160{
161 spin_lock(&caps_list_lock);
162 caps_min_count += delta;
163 BUG_ON(caps_min_count < 0);
164 spin_unlock(&caps_list_lock);
165}
166
167int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
168{
169 int i;
170 struct ceph_cap *cap;
171 int have;
172 int alloc = 0;
173 LIST_HEAD(newcaps);
174 int ret = 0;
175
176 dout("reserve caps ctx=%p need=%d\n", ctx, need);
177
178 /* first reserve any caps that are already allocated */
179 spin_lock(&caps_list_lock);
180 if (caps_avail_count >= need)
181 have = need;
182 else
183 have = caps_avail_count;
184 caps_avail_count -= have;
185 caps_reserve_count += have;
186 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
187 caps_avail_count);
188 spin_unlock(&caps_list_lock);
189
190 for (i = have; i < need; i++) {
191 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
192 if (!cap) {
193 ret = -ENOMEM;
194 goto out_alloc_count;
195 }
196 list_add(&cap->caps_item, &newcaps);
197 alloc++;
198 }
199 BUG_ON(have + alloc != need);
200
201 spin_lock(&caps_list_lock);
202 caps_total_count += alloc;
203 caps_reserve_count += alloc;
204 list_splice(&newcaps, &caps_list);
205
206 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
207 caps_avail_count);
208 spin_unlock(&caps_list_lock);
209
210 ctx->count = need;
211 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
212 ctx, caps_total_count, caps_use_count, caps_reserve_count,
213 caps_avail_count);
214 return 0;
215
216out_alloc_count:
217 /* we didn't manage to reserve as much as we needed */
218 pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
219 ctx, need, have);
220 return ret;
221}
222
223int ceph_unreserve_caps(struct ceph_cap_reservation *ctx)
224{
225 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
226 if (ctx->count) {
227 spin_lock(&caps_list_lock);
228 BUG_ON(caps_reserve_count < ctx->count);
229 caps_reserve_count -= ctx->count;
230 caps_avail_count += ctx->count;
231 ctx->count = 0;
232 dout("unreserve caps %d = %d used + %d resv + %d avail\n",
233 caps_total_count, caps_use_count, caps_reserve_count,
234 caps_avail_count);
235 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
236 caps_avail_count);
237 spin_unlock(&caps_list_lock);
238 }
239 return 0;
240}
241
242static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
243{
244 struct ceph_cap *cap = NULL;
245
246 /* temporary, until we do something about cap import/export */
247 if (!ctx)
248 return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
249
250 spin_lock(&caps_list_lock);
251 dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
252 ctx, ctx->count, caps_total_count, caps_use_count,
253 caps_reserve_count, caps_avail_count);
254 BUG_ON(!ctx->count);
255 BUG_ON(ctx->count > caps_reserve_count);
256 BUG_ON(list_empty(&caps_list));
257
258 ctx->count--;
259 caps_reserve_count--;
260 caps_use_count++;
261
262 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
263 list_del(&cap->caps_item);
264
265 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
266 caps_avail_count);
267 spin_unlock(&caps_list_lock);
268 return cap;
269}
270
271void ceph_put_cap(struct ceph_cap *cap)
272{
273 spin_lock(&caps_list_lock);
274 dout("put_cap %p %d = %d used + %d resv + %d avail\n",
275 cap, caps_total_count, caps_use_count,
276 caps_reserve_count, caps_avail_count);
277 caps_use_count--;
278 /*
279 * Keep some preallocated caps around (caps_min_count), to
280 * avoid lots of free/alloc churn.
281 */
282 if (caps_avail_count >= caps_reserve_count + caps_min_count) {
283 caps_total_count--;
284 kmem_cache_free(ceph_cap_cachep, cap);
285 } else {
286 caps_avail_count++;
287 list_add(&cap->caps_item, &caps_list);
288 }
289
290 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
291 caps_avail_count);
292 spin_unlock(&caps_list_lock);
293}
294
295void ceph_reservation_status(struct ceph_client *client,
296 int *total, int *avail, int *used, int *reserved,
297 int *min)
298{
299 if (total)
300 *total = caps_total_count;
301 if (avail)
302 *avail = caps_avail_count;
303 if (used)
304 *used = caps_use_count;
305 if (reserved)
306 *reserved = caps_reserve_count;
307 if (min)
308 *min = caps_min_count;
309}
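
/*
 * Reading aid (not in the original patch): the reservation lifecycle the
 * functions above implement, with the invariant that is BUG_ON-checked
 * throughout:
 *
 *	caps_total_count == caps_use_count + caps_reserve_count
 *					   + caps_avail_count
 *
 * Sketch of a caller, e.g. an MDS reply handler:
 *
 *	struct ceph_cap_reservation ctx = { 0 };
 *
 *	ceph_reserve_caps(&ctx, need);	// preallocate before decoding the
 *					// reply; this may allocate and fail
 *	cap = get_cap(&ctx);		// cannot fail: drawn from ctx
 *	...				// cap in use; ceph_put_cap() later
 *	ceph_unreserve_caps(&ctx);	// return what we didn't consume
 */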
310
311/*
312 * Find ceph_cap for given mds, if any.
313 *
314 * Called with i_lock held.
315 */
316static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
317{
318 struct ceph_cap *cap;
319 struct rb_node *n = ci->i_caps.rb_node;
320
321 while (n) {
322 cap = rb_entry(n, struct ceph_cap, ci_node);
323 if (mds < cap->mds)
324 n = n->rb_left;
325 else if (mds > cap->mds)
326 n = n->rb_right;
327 else
328 return cap;
329 }
330 return NULL;
331}
332
333/*
334 * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else
335 * -1.
336 */
337static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq)
338{
339 struct ceph_cap *cap;
340 int mds = -1;
341 struct rb_node *p;
342
343 /* prefer mds with WR|WRBUFFER|EXCL caps */
344 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
345 cap = rb_entry(p, struct ceph_cap, ci_node);
346 mds = cap->mds;
347 if (mseq)
348 *mseq = cap->mseq;
349 if (cap->issued & (CEPH_CAP_FILE_WR |
350 CEPH_CAP_FILE_BUFFER |
351 CEPH_CAP_FILE_EXCL))
352 break;
353 }
354 return mds;
355}
356
357int ceph_get_cap_mds(struct inode *inode)
358{
359 int mds;
360 spin_lock(&inode->i_lock);
361 mds = __ceph_get_cap_mds(ceph_inode(inode), NULL);
362 spin_unlock(&inode->i_lock);
363 return mds;
364}
365
366/*
367 * Called under i_lock.
368 */
369static void __insert_cap_node(struct ceph_inode_info *ci,
370 struct ceph_cap *new)
371{
372 struct rb_node **p = &ci->i_caps.rb_node;
373 struct rb_node *parent = NULL;
374 struct ceph_cap *cap = NULL;
375
376 while (*p) {
377 parent = *p;
378 cap = rb_entry(parent, struct ceph_cap, ci_node);
379 if (new->mds < cap->mds)
380 p = &(*p)->rb_left;
381 else if (new->mds > cap->mds)
382 p = &(*p)->rb_right;
383 else
384 BUG();
385 }
386
387 rb_link_node(&new->ci_node, parent, p);
388 rb_insert_color(&new->ci_node, &ci->i_caps);
389}
390
391/*
392 * (re)set cap hold timeouts, which control the delayed release
393 * of unused caps back to the MDS. Should be called on cap use.
394 */
395static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
396 struct ceph_inode_info *ci)
397{
398 struct ceph_mount_args *ma = mdsc->client->mount_args;
399
400 ci->i_hold_caps_min = round_jiffies(jiffies +
401 ma->caps_wanted_delay_min * HZ);
402 ci->i_hold_caps_max = round_jiffies(jiffies +
403 ma->caps_wanted_delay_max * HZ);
404 dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
405 ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
406}
407
408/*
409 * (Re)queue cap at the end of the delayed cap release list.
410 *
411 * If I_FLUSH is set, leave the inode at the front of the list.
412 *
413 * Caller holds i_lock
414 * -> we take mdsc->cap_delay_lock
415 */
416static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
417 struct ceph_inode_info *ci)
418{
419 __cap_set_timeouts(mdsc, ci);
420 dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
421 ci->i_ceph_flags, ci->i_hold_caps_max);
422 if (!mdsc->stopping) {
423 spin_lock(&mdsc->cap_delay_lock);
424 if (!list_empty(&ci->i_cap_delay_list)) {
425 if (ci->i_ceph_flags & CEPH_I_FLUSH)
426 goto no_change;
427 list_del_init(&ci->i_cap_delay_list);
428 }
429 list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
430no_change:
431 spin_unlock(&mdsc->cap_delay_lock);
432 }
433}
434
435/*
436 * Queue an inode for immediate writeback. Mark inode with I_FLUSH,
437 * indicating we should send a cap message to flush dirty metadata
438 * asap, and move to the front of the delayed cap list.
439 */
440static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
441 struct ceph_inode_info *ci)
442{
443 dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
444 spin_lock(&mdsc->cap_delay_lock);
445 ci->i_ceph_flags |= CEPH_I_FLUSH;
446 if (!list_empty(&ci->i_cap_delay_list))
447 list_del_init(&ci->i_cap_delay_list);
448 list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
449 spin_unlock(&mdsc->cap_delay_lock);
450}
451
452/*
453 * Cancel delayed work on cap.
454 *
455 * Caller must hold i_lock.
456 */
457static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
458 struct ceph_inode_info *ci)
459{
460 dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
461 if (list_empty(&ci->i_cap_delay_list))
462 return;
463 spin_lock(&mdsc->cap_delay_lock);
464 list_del_init(&ci->i_cap_delay_list);
465 spin_unlock(&mdsc->cap_delay_lock);
466}
467
468/*
469 * Common issue checks for add_cap, handle_cap_grant.
470 */
471static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
472 unsigned issued)
473{
474 unsigned had = __ceph_caps_issued(ci, NULL);
475
476 /*
477 * Each time we receive FILE_CACHE anew, we increment
478 * i_rdcache_gen.
479 */
480 if ((issued & CEPH_CAP_FILE_CACHE) &&
481 (had & CEPH_CAP_FILE_CACHE) == 0)
482 ci->i_rdcache_gen++;
483
484 /*
485 * if we are newly issued FILE_SHARED, clear I_COMPLETE; we
486 * don't know what happened to this directory while we didn't
487 * have the cap.
488 */
489 if ((issued & CEPH_CAP_FILE_SHARED) &&
490 (had & CEPH_CAP_FILE_SHARED) == 0) {
491 ci->i_shared_gen++;
492 if (S_ISDIR(ci->vfs_inode.i_mode)) {
493 dout(" marking %p NOT complete\n", &ci->vfs_inode);
494 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
495 }
496 }
497}
498
499/*
500 * Add a capability under the given MDS session.
501 *
502 * Caller should hold session snap_rwsem (read) and s_mutex.
503 *
504 * @fmode is the open file mode, if we are opening a file, otherwise
505 * it is < 0. (This is so we can atomically add the cap and add an
506 * open file reference to it.)
507 */
508int ceph_add_cap(struct inode *inode,
509 struct ceph_mds_session *session, u64 cap_id,
510 int fmode, unsigned issued, unsigned wanted,
511 unsigned seq, unsigned mseq, u64 realmino, int flags,
512 struct ceph_cap_reservation *caps_reservation)
513{
514 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
515 struct ceph_inode_info *ci = ceph_inode(inode);
516 struct ceph_cap *new_cap = NULL;
517 struct ceph_cap *cap;
518 int mds = session->s_mds;
519 int actual_wanted;
520
521 dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
522 session->s_mds, cap_id, ceph_cap_string(issued), seq);
523
524 /*
525 * If we are opening the file, include file mode wanted bits
526 * in wanted.
527 */
528 if (fmode >= 0)
529 wanted |= ceph_caps_for_mode(fmode);
530
531retry:
532 spin_lock(&inode->i_lock);
533 cap = __get_cap_for_mds(ci, mds);
534 if (!cap) {
535 if (new_cap) {
536 cap = new_cap;
537 new_cap = NULL;
538 } else {
539 spin_unlock(&inode->i_lock);
540 new_cap = get_cap(caps_reservation);
541 if (new_cap == NULL)
542 return -ENOMEM;
543 goto retry;
544 }
545
546 cap->issued = 0;
547 cap->implemented = 0;
548 cap->mds = mds;
549 cap->mds_wanted = 0;
550
551 cap->ci = ci;
552 __insert_cap_node(ci, cap);
553
554 /* clear out old exporting info? (i.e. on cap import) */
555 if (ci->i_cap_exporting_mds == mds) {
556 ci->i_cap_exporting_issued = 0;
557 ci->i_cap_exporting_mseq = 0;
558 ci->i_cap_exporting_mds = -1;
559 }
560
561 /* add to session cap list */
562 cap->session = session;
563 spin_lock(&session->s_cap_lock);
564 list_add_tail(&cap->session_caps, &session->s_caps);
565 session->s_nr_caps++;
566 spin_unlock(&session->s_cap_lock);
567 }
568
569 if (!ci->i_snap_realm) {
570 /*
571 * add this inode to the appropriate snap realm
572 */
573 struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
574 realmino);
575 if (realm) {
576 ceph_get_snap_realm(mdsc, realm);
577 spin_lock(&realm->inodes_with_caps_lock);
578 ci->i_snap_realm = realm;
579 list_add(&ci->i_snap_realm_item,
580 &realm->inodes_with_caps);
581 spin_unlock(&realm->inodes_with_caps_lock);
582 } else {
583 pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
584 realmino);
585 }
586 }
587
588 __check_cap_issue(ci, cap, issued);
589
590 /*
591 * If we are issued caps we don't want, or the mds' wanted
592 * value appears to be off, queue a check so we'll release
593 * later and/or update the mds wanted value.
594 */
595 actual_wanted = __ceph_caps_wanted(ci);
596 if ((wanted & ~actual_wanted) ||
597 (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
598 dout(" issued %s, mds wanted %s, actual %s, queueing\n",
599 ceph_cap_string(issued), ceph_cap_string(wanted),
600 ceph_cap_string(actual_wanted));
601 __cap_delay_requeue(mdsc, ci);
602 }
603
604 if (flags & CEPH_CAP_FLAG_AUTH)
605 ci->i_auth_cap = cap;
606 else if (ci->i_auth_cap == cap)
607 ci->i_auth_cap = NULL;
608
609 dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
610 inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
611 ceph_cap_string(issued|cap->issued), seq, mds);
612 cap->cap_id = cap_id;
613 cap->issued = issued;
614 cap->implemented |= issued;
615 cap->mds_wanted |= wanted;
616 cap->seq = seq;
617 cap->issue_seq = seq;
618 cap->mseq = mseq;
619 cap->cap_gen = session->s_cap_gen;
620
621 if (fmode >= 0)
622 __ceph_get_fmode(ci, fmode);
623 spin_unlock(&inode->i_lock);
624 wake_up(&ci->i_cap_wq);
625 return 0;
626}
627
628/*
629 * Return true if cap has not timed out and belongs to the current
630 * generation of the MDS session (i.e. has not gone 'stale' due to
631 * us losing touch with the mds).
632 */
633static int __cap_is_valid(struct ceph_cap *cap)
634{
635 unsigned long ttl;
636 u32 gen;
637
638 spin_lock(&cap->session->s_cap_lock);
639 gen = cap->session->s_cap_gen;
640 ttl = cap->session->s_cap_ttl;
641 spin_unlock(&cap->session->s_cap_lock);
642
643 if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
644 dout("__cap_is_valid %p cap %p issued %s "
645 "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
646 cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
647 return 0;
648 }
649
650 return 1;
651}
652
653/*
654 * Return set of valid cap bits issued to us. Note that caps time
655 * out, and may be invalidated in bulk if the client session times out
656 * and session->s_cap_gen is bumped.
657 */
658int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
659{
660 int have = ci->i_snap_caps | ci->i_cap_exporting_issued;
661 struct ceph_cap *cap;
662 struct rb_node *p;
663
664 if (implemented)
665 *implemented = 0;
666 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
667 cap = rb_entry(p, struct ceph_cap, ci_node);
668 if (!__cap_is_valid(cap))
669 continue;
670 dout("__ceph_caps_issued %p cap %p issued %s\n",
671 &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
672 have |= cap->issued;
673 if (implemented)
674 *implemented |= cap->implemented;
675 }
676 return have;
677}
678
679/*
680 * Get cap bits issued by caps other than @ocap
681 */
682int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
683{
684 int have = ci->i_snap_caps;
685 struct ceph_cap *cap;
686 struct rb_node *p;
687
688 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
689 cap = rb_entry(p, struct ceph_cap, ci_node);
690 if (cap == ocap)
691 continue;
692 if (!__cap_is_valid(cap))
693 continue;
694 have |= cap->issued;
695 }
696 return have;
697}
698
699/*
700 * Move a cap to the end of the LRU (oldest caps at list head, newest
701 * at list tail).
702 */
703static void __touch_cap(struct ceph_cap *cap)
704{
705 struct ceph_mds_session *s = cap->session;
706
707 spin_lock(&s->s_cap_lock);
708 if (s->s_cap_iterator == NULL) {
709 dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
710 s->s_mds);
711 list_move_tail(&cap->session_caps, &s->s_caps);
712 } else {
713 dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
714 &cap->ci->vfs_inode, cap, s->s_mds);
715 }
716 spin_unlock(&s->s_cap_lock);
717}
718
719/*
720 * Check if we hold the given mask. If so, move the cap(s) to the
721 * tail (most recently used end) of their respective LRUs. (This is
722 * the preferred way for callers to check for caps they want.)
723 */
724int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
725{
726 struct ceph_cap *cap;
727 struct rb_node *p;
728 int have = ci->i_snap_caps;
729
730 if ((have & mask) == mask) {
731 dout("__ceph_caps_issued_mask %p snap issued %s"
732 " (mask %s)\n", &ci->vfs_inode,
733 ceph_cap_string(have),
734 ceph_cap_string(mask));
735 return 1;
736 }
737
738 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
739 cap = rb_entry(p, struct ceph_cap, ci_node);
740 if (!__cap_is_valid(cap))
741 continue;
742 if ((cap->issued & mask) == mask) {
743 dout("__ceph_caps_issued_mask %p cap %p issued %s"
744 " (mask %s)\n", &ci->vfs_inode, cap,
745 ceph_cap_string(cap->issued),
746 ceph_cap_string(mask));
747 if (touch)
748 __touch_cap(cap);
749 return 1;
750 }
751
752 /* does a combination of caps satisfy mask? */
753 have |= cap->issued;
754 if ((have & mask) == mask) {
755 dout("__ceph_caps_issued_mask %p combo issued %s"
756 " (mask %s)\n", &ci->vfs_inode,
757 ceph_cap_string(cap->issued),
758 ceph_cap_string(mask));
759 if (touch) {
760 struct rb_node *q;
761
762 /* touch this + preceding caps */
763 __touch_cap(cap);
764 for (q = rb_first(&ci->i_caps); q != p;
765 q = rb_next(q)) {
766 cap = rb_entry(q, struct ceph_cap,
767 ci_node);
768 if (!__cap_is_valid(cap))
769 continue;
770 __touch_cap(cap);
771 }
772 }
773 return 1;
774 }
775 }
776
777 return 0;
778}
779
780/*
781 * Return true if mask caps are currently being revoked by an MDS.
782 */
783int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
784{
785 struct inode *inode = &ci->vfs_inode;
786 struct ceph_cap *cap;
787 struct rb_node *p;
788 int ret = 0;
789
790 spin_lock(&inode->i_lock);
791 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
792 cap = rb_entry(p, struct ceph_cap, ci_node);
793 if (__cap_is_valid(cap) &&
794 (cap->implemented & ~cap->issued & mask)) {
795 ret = 1;
796 break;
797 }
798 }
799 spin_unlock(&inode->i_lock);
800 dout("ceph_caps_revoking %p %s = %d\n", inode,
801 ceph_cap_string(mask), ret);
802 return ret;
803}
804
805int __ceph_caps_used(struct ceph_inode_info *ci)
806{
807 int used = 0;
808 if (ci->i_pin_ref)
809 used |= CEPH_CAP_PIN;
810 if (ci->i_rd_ref)
811 used |= CEPH_CAP_FILE_RD;
812 if (ci->i_rdcache_ref || ci->i_rdcache_gen)
813 used |= CEPH_CAP_FILE_CACHE;
814 if (ci->i_wr_ref)
815 used |= CEPH_CAP_FILE_WR;
816 if (ci->i_wrbuffer_ref)
817 used |= CEPH_CAP_FILE_BUFFER;
818 return used;
819}
820
821/*
822 * wanted, by virtue of open file modes
823 */
824int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
825{
826 int want = 0;
827 int mode;
828 for (mode = 0; mode < 4; mode++)
829 if (ci->i_nr_by_mode[mode])
830 want |= ceph_caps_for_mode(mode);
831 return want;
832}
833
834/*
835 * Return caps we have registered with the MDS(s) as 'wanted'.
836 */
837int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
838{
839 struct ceph_cap *cap;
840 struct rb_node *p;
841 int mds_wanted = 0;
842
843 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
844 cap = rb_entry(p, struct ceph_cap, ci_node);
845 if (!__cap_is_valid(cap))
846 continue;
847 mds_wanted |= cap->mds_wanted;
848 }
849 return mds_wanted;
850}
851
852/*
853 * called under i_lock
854 */
855static int __ceph_is_any_caps(struct ceph_inode_info *ci)
856{
857 return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0;
858}
859
860/*
861 * Remove a cap. Take steps to deal with a racing iterate_session_caps.
862 *
863 * caller should hold i_lock.
864 * caller will not hold session s_mutex if called from destroy_inode.
865 */
866void __ceph_remove_cap(struct ceph_cap *cap)
867{
868 struct ceph_mds_session *session = cap->session;
869 struct ceph_inode_info *ci = cap->ci;
870 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
871 int removed = 0;
872
873 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
874
875 /* remove from session list */
876 spin_lock(&session->s_cap_lock);
877 if (session->s_cap_iterator == cap) {
878 /* not yet, we are iterating over this very cap */
879 dout("__ceph_remove_cap delaying %p removal from session %p\n",
880 cap, cap->session);
881 } else {
882 list_del_init(&cap->session_caps);
883 session->s_nr_caps--;
884 cap->session = NULL;
885 removed = 1;
886 }
887 /* protect backpointer with s_cap_lock: see iterate_session_caps */
888 cap->ci = NULL;
889 spin_unlock(&session->s_cap_lock);
890
891 /* remove from inode list */
892 rb_erase(&cap->ci_node, &ci->i_caps);
893 if (ci->i_auth_cap == cap)
894 ci->i_auth_cap = NULL;
895
896 if (removed)
897 ceph_put_cap(cap);
898
899 if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
900 struct ceph_snap_realm *realm = ci->i_snap_realm;
901 spin_lock(&realm->inodes_with_caps_lock);
902 list_del_init(&ci->i_snap_realm_item);
903 ci->i_snap_realm_counter++;
904 ci->i_snap_realm = NULL;
905 spin_unlock(&realm->inodes_with_caps_lock);
906 ceph_put_snap_realm(mdsc, realm);
907 }
908 if (!__ceph_is_any_real_caps(ci))
909 __cap_delay_cancel(mdsc, ci);
910}
911
912/*
913 * Build and send a cap message to the given MDS.
914 *
915 * Caller should be holding s_mutex.
916 */
917static int send_cap_msg(struct ceph_mds_session *session,
918 u64 ino, u64 cid, int op,
919 int caps, int wanted, int dirty,
920 u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
921 u64 size, u64 max_size,
922 struct timespec *mtime, struct timespec *atime,
923 u64 time_warp_seq,
924 uid_t uid, gid_t gid, mode_t mode,
925 u64 xattr_version,
926 struct ceph_buffer *xattrs_buf,
927 u64 follows)
928{
929 struct ceph_mds_caps *fc;
930 struct ceph_msg *msg;
931
932 dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
933 " seq %u/%u mseq %u follows %lld size %llu/%llu"
934 " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
935 cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
936 ceph_cap_string(dirty),
937 seq, issue_seq, mseq, follows, size, max_size,
938 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
939
940 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
941 if (IS_ERR(msg))
942 return PTR_ERR(msg);
943
944 msg->hdr.tid = cpu_to_le64(flush_tid);
945
946 fc = msg->front.iov_base;
947 memset(fc, 0, sizeof(*fc));
948
949 fc->cap_id = cpu_to_le64(cid);
950 fc->op = cpu_to_le32(op);
951 fc->seq = cpu_to_le32(seq);
952 fc->issue_seq = cpu_to_le32(issue_seq);
953 fc->migrate_seq = cpu_to_le32(mseq);
954 fc->caps = cpu_to_le32(caps);
955 fc->wanted = cpu_to_le32(wanted);
956 fc->dirty = cpu_to_le32(dirty);
957 fc->ino = cpu_to_le64(ino);
958 fc->snap_follows = cpu_to_le64(follows);
959
960 fc->size = cpu_to_le64(size);
961 fc->max_size = cpu_to_le64(max_size);
962 if (mtime)
963 ceph_encode_timespec(&fc->mtime, mtime);
964 if (atime)
965 ceph_encode_timespec(&fc->atime, atime);
966 fc->time_warp_seq = cpu_to_le32(time_warp_seq);
967
968 fc->uid = cpu_to_le32(uid);
969 fc->gid = cpu_to_le32(gid);
970 fc->mode = cpu_to_le32(mode);
971
972 fc->xattr_version = cpu_to_le64(xattr_version);
973 if (xattrs_buf) {
974 msg->middle = ceph_buffer_get(xattrs_buf);
975 fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
976 msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
977 }
978
979 ceph_con_send(&session->s_con, msg);
980 return 0;
981}
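
/*
 * All multi-byte fields above are converted with cpu_to_le*() to match
 * the little-endian ceph wire format; xattr data, when present, rides
 * in the message "middle" section rather than in the front payload.
 */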
982
983/*
984 * Queue cap releases when an inode is dropped from our cache. Since
985 * the inode is about to be destroyed, there is no need for i_lock.
986 */
987void ceph_queue_caps_release(struct inode *inode)
988{
989 struct ceph_inode_info *ci = ceph_inode(inode);
990 struct rb_node *p;
991
992 p = rb_first(&ci->i_caps);
993 while (p) {
994 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
995 struct ceph_mds_session *session = cap->session;
996 struct ceph_msg *msg;
997 struct ceph_mds_cap_release *head;
998 struct ceph_mds_cap_item *item;
999
1000 spin_lock(&session->s_cap_lock);
1001 BUG_ON(!session->s_num_cap_releases);
1002 msg = list_first_entry(&session->s_cap_releases,
1003 struct ceph_msg, list_head);
1004
1005 dout(" adding %p release to mds%d msg %p (%d left)\n",
1006 inode, session->s_mds, msg, session->s_num_cap_releases);
1007
1008 BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
1009 head = msg->front.iov_base;
1010 head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
1011 item = msg->front.iov_base + msg->front.iov_len;
1012 item->ino = cpu_to_le64(ceph_ino(inode));
1013 item->cap_id = cpu_to_le64(cap->cap_id);
1014 item->migrate_seq = cpu_to_le32(cap->mseq);
1015 item->seq = cpu_to_le32(cap->issue_seq);
1016
1017 session->s_num_cap_releases--;
1018
1019 msg->front.iov_len += sizeof(*item);
1020 if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
1021 dout(" release msg %p full\n", msg);
1022 list_move_tail(&msg->list_head,
1023 &session->s_cap_releases_done);
1024 } else {
1025 dout(" release msg %p at %d/%d (%d)\n", msg,
1026 (int)le32_to_cpu(head->num),
1027 (int)CEPH_CAPS_PER_RELEASE,
1028 (int)msg->front.iov_len);
1029 }
1030 spin_unlock(&session->s_cap_lock);
1031 p = rb_next(p);
1032 __ceph_remove_cap(cap);
1033 }
1034}
1035
1036/*
1037 * Send a cap msg on the given inode. Update our caps state, then
1038 * drop i_lock and send the message.
1039 *
1040 * Make note of the max_size reported/requested from the mds, and of
1041 * revoked caps that have now been implemented.
1042 *
1043 * Make a half-hearted attempt to invalidate the page cache if we are
1044 * dropping RDCACHE. Note that this will leave behind locked pages
1045 * that we'll then need to deal with elsewhere.
1046 *
1047 * Return non-zero if the release was delayed or we experienced an
1048 * error, in which case the caller should requeue + retry later.
1049 *
1050 * called with i_lock, then drops it.
1051 * caller should hold snap_rwsem (read), s_mutex.
1052 */
1053static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1054 int op, int used, int want, int retain, int flushing,
1055 unsigned *pflush_tid)
1056 __releases(cap->ci->vfs_inode->i_lock)
1057{
1058 struct ceph_inode_info *ci = cap->ci;
1059 struct inode *inode = &ci->vfs_inode;
1060 u64 cap_id = cap->cap_id;
1061 int held, revoking, dropping, keep;
1062 u64 seq, issue_seq, mseq, time_warp_seq, follows;
1063 u64 size, max_size;
1064 struct timespec mtime, atime;
1065 int wake = 0;
1066 mode_t mode;
1067 uid_t uid;
1068 gid_t gid;
1069 struct ceph_mds_session *session;
1070 u64 xattr_version = 0;
1071 int delayed = 0;
1072 u64 flush_tid = 0;
1073 int i;
1074 int ret;
1075
1076 held = cap->issued | cap->implemented;
1077 revoking = cap->implemented & ~cap->issued;
1078 retain &= ~revoking;
1079 dropping = cap->issued & ~retain;
1080
1081 dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
1082 inode, cap, cap->session,
1083 ceph_cap_string(held), ceph_cap_string(held & retain),
1084 ceph_cap_string(revoking));
1085 BUG_ON((retain & CEPH_CAP_PIN) == 0);
1086
1087 session = cap->session;
1088
1089 /* don't release wanted unless we've waited a bit. */
1090 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1091 time_before(jiffies, ci->i_hold_caps_min)) {
1092 dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
1093 ceph_cap_string(cap->issued),
1094 ceph_cap_string(cap->issued & retain),
1095 ceph_cap_string(cap->mds_wanted),
1096 ceph_cap_string(want));
1097 want |= cap->mds_wanted;
1098 retain |= cap->issued;
1099 delayed = 1;
1100 }
1101 ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
1102
1103 cap->issued &= retain; /* drop bits we don't want */
1104 if (cap->implemented & ~cap->issued) {
1105 /*
1106 * Wake up any waiters on wanted -> needed transition.
1107 * This is due to the weird transition from buffered
1108 * to sync IO... we need to flush dirty pages _before_
1109 * allowing sync writes to avoid reordering.
1110 */
1111 wake = 1;
1112 }
1113 cap->implemented &= cap->issued | used;
1114 cap->mds_wanted = want;
1115
1116 if (flushing) {
1117 /*
1118 * assign a tid for flush operations so we can avoid
1119 * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
1120 * clean type races. track latest tid for every bit
1121 * so we can handle flush AxFw, flush Fw, and have the
1122 * first ack clean Ax.
1123 */
1124 flush_tid = ++ci->i_cap_flush_last_tid;
1125 if (pflush_tid)
1126 *pflush_tid = flush_tid;
1127 dout(" cap_flush_tid %d\n", (int)flush_tid);
1128 for (i = 0; i < CEPH_CAP_BITS; i++)
1129 if (flushing & (1 << i))
1130 ci->i_cap_flush_tid[i] = flush_tid;
1131 }
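
	/*
	 * Worked example of the per-bit tids above: flushing Ax+Fw
	 * assigns, say, tid 1 to both bits; a later flush of Fw alone
	 * bumps that bit's i_cap_flush_tid entry to 2. The ack for
	 * tid 1 then cleans only Ax (still recorded at tid 1), and Fw
	 * stays flushing until the tid-2 ack (see handle_cap_flush_ack).
	 */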
1132
1133 keep = cap->implemented;
1134 seq = cap->seq;
1135 issue_seq = cap->issue_seq;
1136 mseq = cap->mseq;
1137 size = inode->i_size;
1138 ci->i_reported_size = size;
1139 max_size = ci->i_wanted_max_size;
1140 ci->i_requested_max_size = max_size;
1141 mtime = inode->i_mtime;
1142 atime = inode->i_atime;
1143 time_warp_seq = ci->i_time_warp_seq;
1144 follows = ci->i_snap_realm->cached_context->seq;
1145 uid = inode->i_uid;
1146 gid = inode->i_gid;
1147 mode = inode->i_mode;
1148
1149 if (dropping & CEPH_CAP_XATTR_EXCL) {
1150 __ceph_build_xattrs_blob(ci);
1151 xattr_version = ci->i_xattrs.version + 1;
1152 }
1153
1154 spin_unlock(&inode->i_lock);
1155
1156 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
1157 op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
1158 size, max_size, &mtime, &atime, time_warp_seq,
1159 uid, gid, mode,
1160 xattr_version,
1161 (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
1162 follows);
1163 if (ret < 0) {
1164 dout("error sending cap msg, must requeue %p\n", inode);
1165 delayed = 1;
1166 }
1167
1168 if (wake)
1169 wake_up(&ci->i_cap_wq);
1170
1171 return delayed;
1172}
1173
1174/*
1175 * When a snapshot is taken, clients accumulate dirty metadata on
1176 * inodes with capabilities in ceph_cap_snaps to describe the file
1177 * state at the time the snapshot was taken. This must be flushed
1178 * asynchronously back to the MDS once sync writes complete and dirty
1179 * data is written out.
1180 *
1181 * Called under i_lock. Takes s_mutex as needed.
1182 */
1183void __ceph_flush_snaps(struct ceph_inode_info *ci,
1184 struct ceph_mds_session **psession)
1185{
1186 struct inode *inode = &ci->vfs_inode;
1187 int mds;
1188 struct ceph_cap_snap *capsnap;
1189 u32 mseq;
1190 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
1191 struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
1192 session->s_mutex */
1193 u64 next_follows = 0; /* keep track of how far we've gotten through the
1194 i_cap_snaps list, and skip these entries next time
1195 around to avoid an infinite loop */
1196
1197 if (psession)
1198 session = *psession;
1199
1200 dout("__flush_snaps %p\n", inode);
1201retry:
1202 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
1203 /* avoid an infinite loop after retry */
1204 if (capsnap->follows < next_follows)
1205 continue;
1206 /*
1207 * we need to wait for sync writes to complete and for dirty
1208 * pages to be written out.
1209 */
1210 if (capsnap->dirty_pages || capsnap->writing)
1211 continue;
1212
1213 /*
1214 * if cap writeback already occurred, we should have dropped
1215 * the capsnap in ceph_put_wrbuffer_cap_refs.
1216 */
1217 BUG_ON(capsnap->dirty == 0);
1218
1219 /* pick mds, take s_mutex */
1220 mds = __ceph_get_cap_mds(ci, &mseq);
1221 if (session && session->s_mds != mds) {
1222 dout("oops, wrong session %p mutex\n", session);
1223 mutex_unlock(&session->s_mutex);
1224 ceph_put_mds_session(session);
1225 session = NULL;
1226 }
1227 if (!session) {
1228 spin_unlock(&inode->i_lock);
1229 mutex_lock(&mdsc->mutex);
1230 session = __ceph_lookup_mds_session(mdsc, mds);
1231 mutex_unlock(&mdsc->mutex);
1232 if (session) {
1233 dout("inverting session/ino locks on %p\n",
1234 session);
1235 mutex_lock(&session->s_mutex);
1236 }
1237 /*
1238 * if session == NULL, we raced against a cap
1239 * deletion. retry, and we'll get a better
1240 * @mds value next time.
1241 */
1242 spin_lock(&inode->i_lock);
1243 goto retry;
1244 }
1245
1246 capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
1247 atomic_inc(&capsnap->nref);
1248 if (!list_empty(&capsnap->flushing_item))
1249 list_del_init(&capsnap->flushing_item);
1250 list_add_tail(&capsnap->flushing_item,
1251 &session->s_cap_snaps_flushing);
1252 spin_unlock(&inode->i_lock);
1253
1254 dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
1255 inode, capsnap, next_follows, capsnap->size);
1256 send_cap_msg(session, ceph_vino(inode).ino, 0,
1257 CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
1258 capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
1259 capsnap->size, 0,
1260 &capsnap->mtime, &capsnap->atime,
1261 capsnap->time_warp_seq,
1262 capsnap->uid, capsnap->gid, capsnap->mode,
1263 0, NULL,
1264 capsnap->follows);
1265
1266 next_follows = capsnap->follows + 1;
1267 ceph_put_cap_snap(capsnap);
1268
1269 spin_lock(&inode->i_lock);
1270 goto retry;
1271 }
1272
1273 /* we flushed them all; remove this inode from the queue */
1274 spin_lock(&mdsc->snap_flush_lock);
1275 list_del_init(&ci->i_snap_flush_item);
1276 spin_unlock(&mdsc->snap_flush_lock);
1277
1278 if (psession)
1279 *psession = session;
1280 else if (session) {
1281 mutex_unlock(&session->s_mutex);
1282 ceph_put_mds_session(session);
1283 }
1284}
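
/*
 * Lock-ordering note: s_mutex must be taken before i_lock, which is why
 * the loop above drops i_lock before grabbing a session mutex and then
 * restarts the scan from the top; next_follows keeps the retry from
 * revisiting capsnaps that have already been flushed.
 */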
1285
1286static void ceph_flush_snaps(struct ceph_inode_info *ci)
1287{
1288 struct inode *inode = &ci->vfs_inode;
1289
1290 spin_lock(&inode->i_lock);
1291 __ceph_flush_snaps(ci, NULL);
1292 spin_unlock(&inode->i_lock);
1293}
1294
1295/*
1296 * Mark caps dirty. If inode is newly dirty, add to the global dirty
1297 * list.
1298 */
1299void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1300{
1301 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
1302 struct inode *inode = &ci->vfs_inode;
1303 int was = ci->i_dirty_caps;
1304 int dirty = 0;
1305
1306 dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
1307 ceph_cap_string(mask), ceph_cap_string(was),
1308 ceph_cap_string(was | mask));
1309 ci->i_dirty_caps |= mask;
1310 if (was == 0) {
1311 dout(" inode %p now dirty\n", &ci->vfs_inode);
1312 BUG_ON(!list_empty(&ci->i_dirty_item));
1313 spin_lock(&mdsc->cap_dirty_lock);
1314 list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
1315 spin_unlock(&mdsc->cap_dirty_lock);
1316 if (ci->i_flushing_caps == 0) {
1317 igrab(inode);
1318 dirty |= I_DIRTY_SYNC;
1319 }
1320 }
1321 BUG_ON(list_empty(&ci->i_dirty_item));
1322 if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
1323 (mask & CEPH_CAP_FILE_BUFFER))
1324 dirty |= I_DIRTY_DATASYNC;
1325 if (dirty)
1326 __mark_inode_dirty(inode, dirty);
1327 __cap_delay_requeue(mdsc, ci);
1328}
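
/*
 * The igrab() above pins the inode while it sits on mdsc->cap_dirty;
 * the matching iput() happens once a flush ack clears the last
 * dirty/flushing bits (see handle_cap_flush_ack below).
 */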
1329
1330/*
1331 * Add dirty inode to the flushing list. Assign a seq number so we
1332 * can wait for caps to flush without starving.
1333 *
1334 * Called under i_lock.
1335 */
1336static int __mark_caps_flushing(struct inode *inode,
1337 struct ceph_mds_session *session)
1338{
1339 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1340 struct ceph_inode_info *ci = ceph_inode(inode);
1341 int flushing;
1342
1343 BUG_ON(ci->i_dirty_caps == 0);
1344 BUG_ON(list_empty(&ci->i_dirty_item));
1345
1346 flushing = ci->i_dirty_caps;
1347 dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
1348 ceph_cap_string(flushing),
1349 ceph_cap_string(ci->i_flushing_caps),
1350 ceph_cap_string(ci->i_flushing_caps | flushing));
1351 ci->i_flushing_caps |= flushing;
1352 ci->i_dirty_caps = 0;
1353 dout(" inode %p now !dirty\n", inode);
1354
1355 spin_lock(&mdsc->cap_dirty_lock);
1356 list_del_init(&ci->i_dirty_item);
1357
1358 ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
1359 if (list_empty(&ci->i_flushing_item)) {
1360 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1361 mdsc->num_cap_flushing++;
1362 dout(" inode %p now flushing seq %lld\n", inode,
1363 ci->i_cap_flush_seq);
1364 } else {
1365 list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1366 dout(" inode %p now flushing (more) seq %lld\n", inode,
1367 ci->i_cap_flush_seq);
1368 }
1369 spin_unlock(&mdsc->cap_dirty_lock);
1370
1371 return flushing;
1372}
1373
1374/*
1375 * try to invalidate mapping pages without blocking.
1376 */
1377static int mapping_is_empty(struct address_space *mapping)
1378{
1379 struct page *page = find_get_page(mapping, 0);
1380
1381 if (!page)
1382 return 1;
1383
1384 put_page(page);
1385 return 0;
1386}
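
/*
 * Note that only page index 0 is sampled; a mapping whose only pages
 * live at higher offsets would be misreported as empty. It is a cheap
 * heuristic, backed up by the i_rdcache_gen check below.
 */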
1387
1388static int try_nonblocking_invalidate(struct inode *inode)
1389{
1390 struct ceph_inode_info *ci = ceph_inode(inode);
1391 u32 invalidating_gen = ci->i_rdcache_gen;
1392
1393 spin_unlock(&inode->i_lock);
1394 invalidate_mapping_pages(&inode->i_data, 0, -1);
1395 spin_lock(&inode->i_lock);
1396
1397 if (mapping_is_empty(&inode->i_data) &&
1398 invalidating_gen == ci->i_rdcache_gen) {
1399 /* success. */
1400 dout("try_nonblocking_invalidate %p success\n", inode);
1401 ci->i_rdcache_gen = 0;
1402 ci->i_rdcache_revoking = 0;
1403 return 0;
1404 }
1405 dout("try_nonblocking_invalidate %p failed\n", inode);
1406 return -1;
1407}
1408
1409/*
1410 * Swiss army knife function to examine currently used and wanted
1411 * versus held caps. Release, flush, ack revoked caps to mds as
1412 * appropriate.
1413 *
1414 * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
1415 * cap release further.
1416 * CHECK_CAPS_AUTHONLY - we should only check the auth cap
1417 * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
1418 * further delay.
1419 */
1420void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1421 struct ceph_mds_session *session)
1422 __releases(session->s_mutex)
1423{
1424 struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
1425 struct ceph_mds_client *mdsc = &client->mdsc;
1426 struct inode *inode = &ci->vfs_inode;
1427 struct ceph_cap *cap;
1428 int file_wanted, used;
1429 int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
1430 int issued, implemented, want, retain, revoking, flushing = 0;
1431 int mds = -1; /* keep track of how far we've gone through i_caps list
1432 to avoid an infinite loop on retry */
1433 struct rb_node *p;
1434 int tried_invalidate = 0;
1435 int delayed = 0, sent = 0, force_requeue = 0, num;
1436 int queue_invalidate = 0;
1437 int is_delayed = flags & CHECK_CAPS_NODELAY;
1438
1439 /* if we are unmounting, flush any unused caps immediately. */
1440 if (mdsc->stopping)
1441 is_delayed = 1;
1442
1443 spin_lock(&inode->i_lock);
1444
1445 if (ci->i_ceph_flags & CEPH_I_FLUSH)
1446 flags |= CHECK_CAPS_FLUSH;
1447
1448 /* flush snaps first time around only */
1449 if (!list_empty(&ci->i_cap_snaps))
1450 __ceph_flush_snaps(ci, &session);
1451 goto retry_locked;
1452retry:
1453 spin_lock(&inode->i_lock);
1454retry_locked:
1455 file_wanted = __ceph_caps_file_wanted(ci);
1456 used = __ceph_caps_used(ci);
1457 want = file_wanted | used;
1458 issued = __ceph_caps_issued(ci, &implemented);
1459 revoking = implemented & ~issued;
1460
1461 retain = want | CEPH_CAP_PIN;
1462 if (!mdsc->stopping && inode->i_nlink > 0) {
1463 if (want) {
1464 retain |= CEPH_CAP_ANY; /* be greedy */
1465 } else {
1466 retain |= CEPH_CAP_ANY_SHARED;
1467 /*
1468 * keep RD only if we didn't have the file open RW,
1469 * because then the mds would revoke it anyway to
1470 * journal max_size=0.
1471 */
1472 if (ci->i_max_size == 0)
1473 retain |= CEPH_CAP_ANY_RD;
1474 }
1475 }
1476
1477 dout("check_caps %p file_want %s used %s dirty %s flushing %s"
1478 " issued %s revoking %s retain %s %s%s%s\n", inode,
1479 ceph_cap_string(file_wanted),
1480 ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
1481 ceph_cap_string(ci->i_flushing_caps),
1482 ceph_cap_string(issued), ceph_cap_string(revoking),
1483 ceph_cap_string(retain),
1484 (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
1485 (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
1486 (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
1487
1488 /*
1489 * If we no longer need to hold on to our old caps, and we may
1490 * have cached pages that we don't want, then try to invalidate.
1491 * If we fail, it's because pages are locked... try again later.
1492 */
1493 if ((!is_delayed || mdsc->stopping) &&
1494 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
1495 ci->i_rdcache_gen && /* may have cached pages */
1496 (file_wanted == 0 || /* no open files */
1497 (revoking & CEPH_CAP_FILE_CACHE)) && /* or revoking cache */
1498 !tried_invalidate) {
1499 dout("check_caps trying to invalidate on %p\n", inode);
1500 if (try_nonblocking_invalidate(inode) < 0) {
1501 if (revoking & CEPH_CAP_FILE_CACHE) {
1502 dout("check_caps queuing invalidate\n");
1503 queue_invalidate = 1;
1504 ci->i_rdcache_revoking = ci->i_rdcache_gen;
1505 } else {
1506 dout("check_caps failed to invalidate pages\n");
1507 /* we failed to invalidate pages. check these
1508 caps again later. */
1509 force_requeue = 1;
1510 __cap_set_timeouts(mdsc, ci);
1511 }
1512 }
1513 tried_invalidate = 1;
1514 goto retry_locked;
1515 }
1516
1517 num = 0;
1518 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
1519 cap = rb_entry(p, struct ceph_cap, ci_node);
1520 num++;
1521
1522 /* avoid looping forever */
1523 if (mds >= cap->mds ||
1524 ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
1525 continue;
1526
1527 /* NOTE: no side-effects allowed, until we take s_mutex */
1528
1529 revoking = cap->implemented & ~cap->issued;
1530 if (revoking)
1531 dout(" mds%d revoking %s\n", cap->mds,
1532 ceph_cap_string(revoking));
1533
1534 if (cap == ci->i_auth_cap &&
1535 (cap->issued & CEPH_CAP_FILE_WR)) {
1536 /* request larger max_size from MDS? */
1537 if (ci->i_wanted_max_size > ci->i_max_size &&
1538 ci->i_wanted_max_size > ci->i_requested_max_size) {
1539 dout("requesting new max_size\n");
1540 goto ack;
1541 }
1542
1543 /* approaching file_max? */
1544 if ((inode->i_size << 1) >= ci->i_max_size &&
1545 (ci->i_reported_size << 1) < ci->i_max_size) {
1546 dout("i_size approaching max_size\n");
1547 goto ack;
1548 }
1549 }
1550 /* flush anything dirty? */
1551 if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
1552 ci->i_dirty_caps) {
1553 dout("flushing dirty caps\n");
1554 goto ack;
1555 }
1556
1557 /* completed revocation? going down and there are no caps? */
1558 if (revoking && (revoking & used) == 0) {
1559 dout("completed revocation of %s\n",
1560 ceph_cap_string(cap->implemented & ~cap->issued));
1561 goto ack;
1562 }
1563
1564 /* want more caps from mds? */
1565 if (want & ~(cap->mds_wanted | cap->issued))
1566 goto ack;
1567
1568 /* things we might delay */
1569 if ((cap->issued & ~retain) == 0 &&
1570 cap->mds_wanted == want)
1571 continue; /* nope, all good */
1572
1573 if (is_delayed)
1574 goto ack;
1575
1576 /* delay? */
1577 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1578 time_before(jiffies, ci->i_hold_caps_max)) {
1579 dout(" delaying issued %s -> %s, wanted %s -> %s\n",
1580 ceph_cap_string(cap->issued),
1581 ceph_cap_string(cap->issued & retain),
1582 ceph_cap_string(cap->mds_wanted),
1583 ceph_cap_string(want));
1584 delayed++;
1585 continue;
1586 }
1587
1588ack:
1589 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
1590 dout(" skipping %p I_NOFLUSH set\n", inode);
1591 continue;
1592 }
1593
1594 if (session && session != cap->session) {
1595 dout("oops, wrong session %p mutex\n", session);
1596 mutex_unlock(&session->s_mutex);
1597 session = NULL;
1598 }
1599 if (!session) {
1600 session = cap->session;
1601 if (mutex_trylock(&session->s_mutex) == 0) {
1602 dout("inverting session/ino locks on %p\n",
1603 session);
1604 spin_unlock(&inode->i_lock);
1605 if (took_snap_rwsem) {
1606 up_read(&mdsc->snap_rwsem);
1607 took_snap_rwsem = 0;
1608 }
1609 mutex_lock(&session->s_mutex);
1610 goto retry;
1611 }
1612 }
1613 /* take snap_rwsem after session mutex */
1614 if (!took_snap_rwsem) {
1615 if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
1616 dout("inverting snap/in locks on %p\n",
1617 inode);
1618 spin_unlock(&inode->i_lock);
1619 down_read(&mdsc->snap_rwsem);
1620 took_snap_rwsem = 1;
1621 goto retry;
1622 }
1623 took_snap_rwsem = 1;
1624 }
1625
1626 if (cap == ci->i_auth_cap && ci->i_dirty_caps)
1627 flushing = __mark_caps_flushing(inode, session);
1628
1629 mds = cap->mds; /* remember mds, so we don't repeat */
1630 sent++;
1631
1632 /* __send_cap drops i_lock */
1633 delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
1634 retain, flushing, NULL);
1635 goto retry; /* retake i_lock and restart our cap scan. */
1636 }
1637
1638 /*
1639 * Reschedule delayed caps release if we delayed anything,
1640 * otherwise cancel.
1641 */
1642 if (delayed && is_delayed)
1643 force_requeue = 1; /* __send_cap delayed release; requeue */
1644 if (!delayed && !is_delayed)
1645 __cap_delay_cancel(mdsc, ci);
1646 else if (!is_delayed || force_requeue)
1647 __cap_delay_requeue(mdsc, ci);
1648
1649 spin_unlock(&inode->i_lock);
1650
1651 if (queue_invalidate)
1652 ceph_queue_invalidate(inode);
1653
1654 if (session)
1655 mutex_unlock(&session->s_mutex);
1656 if (took_snap_rwsem)
1657 up_read(&mdsc->snap_rwsem);
1658}
1659
1660/*
1661 * Try to flush dirty caps back to the auth mds.
1662 */
1663static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
1664 unsigned *flush_tid)
1665{
1666 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1667 struct ceph_inode_info *ci = ceph_inode(inode);
1668 int unlock_session = session ? 0 : 1;
1669 int flushing = 0;
1670
1671retry:
1672 spin_lock(&inode->i_lock);
1673 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
1674 dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
1675 goto out;
1676 }
1677 if (ci->i_dirty_caps && ci->i_auth_cap) {
1678 struct ceph_cap *cap = ci->i_auth_cap;
1679 int used = __ceph_caps_used(ci);
1680 int want = __ceph_caps_wanted(ci);
1681 int delayed;
1682
1683 if (!session) {
1684 spin_unlock(&inode->i_lock);
1685 session = cap->session;
1686 mutex_lock(&session->s_mutex);
1687 goto retry;
1688 }
1689 BUG_ON(session != cap->session);
1690 if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
1691 goto out;
1692
1693 flushing = __mark_caps_flushing(inode, session);
1694
1695 /* __send_cap drops i_lock */
1696 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
1697 cap->issued | cap->implemented, flushing,
1698 flush_tid);
1699 if (!delayed)
1700 goto out_unlocked;
1701
1702 spin_lock(&inode->i_lock);
1703 __cap_delay_requeue(mdsc, ci);
1704 }
1705out:
1706 spin_unlock(&inode->i_lock);
1707out_unlocked:
1708 if (session && unlock_session)
1709 mutex_unlock(&session->s_mutex);
1710 return flushing;
1711}
1712
1713/*
1714 * Return true if we've flushed caps through the given flush_tid.
1715 */
1716static int caps_are_flushed(struct inode *inode, unsigned tid)
1717{
1718 struct ceph_inode_info *ci = ceph_inode(inode);
1719 int dirty, i, ret = 1;
1720
1721 spin_lock(&inode->i_lock);
1722 dirty = __ceph_caps_dirty(ci);
1723 for (i = 0; i < CEPH_CAP_BITS; i++)
1724 if ((ci->i_flushing_caps & (1 << i)) &&
1725 ci->i_cap_flush_tid[i] <= tid) {
1726 /* still flushing this bit */
1727 ret = 0;
1728 break;
1729 }
1730 spin_unlock(&inode->i_lock);
1731 return ret;
1732}
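
/*
 * A bit counts as "still flushing" only while its recorded tid is at or
 * below the tid we are waiting on; a bit re-flushed with a newer tid no
 * longer blocks this particular waiter.
 */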
1733
1734/*
1735 * Wait on any unsafe replies for the given inode. First wait on the
1736 * newest request, and make that the upper bound. Then, if there are
1737 * more requests, keep waiting on the oldest as long as it is still older
1738 * than the original request.
1739 */
1740static void sync_write_wait(struct inode *inode)
1741{
1742 struct ceph_inode_info *ci = ceph_inode(inode);
1743 struct list_head *head = &ci->i_unsafe_writes;
1744 struct ceph_osd_request *req;
1745 u64 last_tid;
1746
1747 spin_lock(&ci->i_unsafe_lock);
1748 if (list_empty(head))
1749 goto out;
1750
1751 /* set upper bound as _last_ entry in chain */
1752 req = list_entry(head->prev, struct ceph_osd_request,
1753 r_unsafe_item);
1754 last_tid = req->r_tid;
1755
1756 do {
1757 ceph_osdc_get_request(req);
1758 spin_unlock(&ci->i_unsafe_lock);
1759 dout("sync_write_wait on tid %llu (until %llu)\n",
1760 req->r_tid, last_tid);
1761 wait_for_completion(&req->r_safe_completion);
1762 spin_lock(&ci->i_unsafe_lock);
1763 ceph_osdc_put_request(req);
1764
1765 /*
1766 * from here on look at first entry in chain, since we
1767 * only want to wait for anything older than last_tid
1768 */
1769 if (list_empty(head))
1770 break;
1771 req = list_entry(head->next, struct ceph_osd_request,
1772 r_unsafe_item);
1773 } while (req->r_tid < last_tid);
1774out:
1775 spin_unlock(&ci->i_unsafe_lock);
1776}
1777
1778int ceph_fsync(struct file *file, struct dentry *dentry, int datasync)
1779{
1780 struct inode *inode = dentry->d_inode;
1781 struct ceph_inode_info *ci = ceph_inode(inode);
1782 unsigned flush_tid;
1783 int ret;
1784 int dirty;
1785
1786 dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
1787 sync_write_wait(inode);
1788
1789 ret = filemap_write_and_wait(inode->i_mapping);
1790 if (ret < 0)
1791 return ret;
1792
1793 dirty = try_flush_caps(inode, NULL, &flush_tid);
1794 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
1795
1796 /*
1797 * only wait on non-file metadata writeback (the mds
1798 * can recover size and mtime, so we don't need to
1799 * wait for that)
1800 */
1801 if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
1802 dout("fsync waiting for flush_tid %u\n", flush_tid);
1803 ret = wait_event_interruptible(ci->i_cap_wq,
1804 caps_are_flushed(inode, flush_tid));
1805 }
1806
1807 dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
1808 return ret;
1809}
1810
1811/*
1812 * Flush any dirty caps back to the mds. If we aren't asked to wait,
1813 * queue inode for flush but don't do so immediately, because we can
1814 * get by with fewer MDS messages if we wait for data writeback to
1815 * complete first.
1816 */
1817int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
1818{
1819 struct ceph_inode_info *ci = ceph_inode(inode);
1820 unsigned flush_tid;
1821 int err = 0;
1822 int dirty;
1823 int wait = wbc->sync_mode == WB_SYNC_ALL;
1824
1825 dout("write_inode %p wait=%d\n", inode, wait);
1826 if (wait) {
1827 dirty = try_flush_caps(inode, NULL, &flush_tid);
1828 if (dirty)
1829 err = wait_event_interruptible(ci->i_cap_wq,
1830 caps_are_flushed(inode, flush_tid));
1831 } else {
1832 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1833
1834 spin_lock(&inode->i_lock);
1835 if (__ceph_caps_dirty(ci))
1836 __cap_delay_requeue_front(mdsc, ci);
1837 spin_unlock(&inode->i_lock);
1838 }
1839 return err;
1840}
1841
1842/*
1843 * After a recovering MDS goes active, we need to resend any caps
1844 * we were flushing.
1845 *
1846 * Caller holds session->s_mutex.
1847 */
1848static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
1849 struct ceph_mds_session *session)
1850{
1851 struct ceph_cap_snap *capsnap;
1852
1853 dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
1854 list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
1855 flushing_item) {
1856 struct ceph_inode_info *ci = capsnap->ci;
1857 struct inode *inode = &ci->vfs_inode;
1858 struct ceph_cap *cap;
1859
1860 spin_lock(&inode->i_lock);
1861 cap = ci->i_auth_cap;
1862 if (cap && cap->session == session) {
1863 dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
1864 cap, capsnap);
1865 __ceph_flush_snaps(ci, &session);
1866 } else {
1867 pr_err("%p auth cap %p not mds%d ???\n", inode,
1868 cap, session->s_mds);
1869 }
1870 spin_unlock(&inode->i_lock);
1871 }
1872}
1873
1874void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
1875 struct ceph_mds_session *session)
1876{
1877 struct ceph_inode_info *ci;
1878
1879 kick_flushing_capsnaps(mdsc, session);
1880
1881 dout("kick_flushing_caps mds%d\n", session->s_mds);
1882 list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
1883 struct inode *inode = &ci->vfs_inode;
1884 struct ceph_cap *cap;
1885 int delayed = 0;
1886
1887 spin_lock(&inode->i_lock);
1888 cap = ci->i_auth_cap;
1889 if (cap && cap->session == session) {
1890 dout("kick_flushing_caps %p cap %p %s\n", inode,
1891 cap, ceph_cap_string(ci->i_flushing_caps));
1892 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
1893 __ceph_caps_used(ci),
1894 __ceph_caps_wanted(ci),
1895 cap->issued | cap->implemented,
1896 ci->i_flushing_caps, NULL);
1897 if (delayed) {
1898 spin_lock(&inode->i_lock);
1899 __cap_delay_requeue(mdsc, ci);
1900 spin_unlock(&inode->i_lock);
1901 }
1902 } else {
1903 pr_err("%p auth cap %p not mds%d ???\n", inode,
1904 cap, session->s_mds);
1905 spin_unlock(&inode->i_lock);
1906 }
1907 }
1908}
1909
1910
1911/*
1912 * Take references to capabilities we hold, so that we don't release
1913 * them to the MDS prematurely.
1914 *
1915 * Protected by i_lock.
1916 */
1917static void __take_cap_refs(struct ceph_inode_info *ci, int got)
1918{
1919 if (got & CEPH_CAP_PIN)
1920 ci->i_pin_ref++;
1921 if (got & CEPH_CAP_FILE_RD)
1922 ci->i_rd_ref++;
1923 if (got & CEPH_CAP_FILE_CACHE)
1924 ci->i_rdcache_ref++;
1925 if (got & CEPH_CAP_FILE_WR)
1926 ci->i_wr_ref++;
1927 if (got & CEPH_CAP_FILE_BUFFER) {
1928 if (ci->i_wrbuffer_ref == 0)
1929 igrab(&ci->vfs_inode);
1930 ci->i_wrbuffer_ref++;
1931 dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n",
1932 &ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref);
1933 }
1934}
1935
1936/*
1937 * Try to grab cap references. Specify those refs we @want, and the
1938 * minimal set we @need. Also include the larger offset we are writing
1939 * to (when applicable), and check against max_size here as well.
1940 * Note that caller is responsible for ensuring max_size increases are
1941 * requested from the MDS.
1942 */
1943static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
1944 int *got, loff_t endoff, int *check_max, int *err)
1945{
1946 struct inode *inode = &ci->vfs_inode;
1947 int ret = 0;
1948 int have, implemented;
1949 int file_wanted;
1950
1951 dout("get_cap_refs %p need %s want %s\n", inode,
1952 ceph_cap_string(need), ceph_cap_string(want));
1953 spin_lock(&inode->i_lock);
1954
1955 /* make sure file is actually open */
1956 file_wanted = __ceph_caps_file_wanted(ci);
1957 if ((file_wanted & need) == 0) {
1958 dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
1959 ceph_cap_string(need), ceph_cap_string(file_wanted));
1960 *err = -EBADF;
1961 ret = 1;
1962 goto out;
1963 }
1964
1965 if (need & CEPH_CAP_FILE_WR) {
1966 if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
1967 dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
1968 inode, endoff, ci->i_max_size);
1969 if (endoff > ci->i_wanted_max_size) {
1970 *check_max = 1;
1971 ret = 1;
1972 }
1973 goto out;
1974 }
1975 /*
1976 * If a sync write is in progress, we must wait, so that we
1977 * can get a final snapshot value for size+mtime.
1978 */
1979 if (__ceph_have_pending_cap_snap(ci)) {
1980 dout("get_cap_refs %p cap_snap_pending\n", inode);
1981 goto out;
1982 }
1983 }
1984 have = __ceph_caps_issued(ci, &implemented);
1985
1986 /*
1987 * disallow writes while a truncate is pending
1988 */
1989 if (ci->i_truncate_pending)
1990 have &= ~CEPH_CAP_FILE_WR;
1991
1992 if ((have & need) == need) {
1993 /*
1994 * Look at (implemented & ~have & not) so that we keep waiting
1995 * on transition from wanted -> needed caps. This is needed
1996 * for WRBUFFER|WR -> WR to avoid a new WR sync write from
1997 * going before a prior buffered writeback happens.
1998 */
1999 int not = want & ~(have & need);
2000 int revoking = implemented & ~have;
2001 dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
2002 inode, ceph_cap_string(have), ceph_cap_string(not),
2003 ceph_cap_string(revoking));
2004 if ((revoking & not) == 0) {
2005 *got = need | (have & want);
2006 __take_cap_refs(ci, *got);
2007 ret = 1;
2008 }
2009 } else {
2010 dout("get_cap_refs %p have %s needed %s\n", inode,
2011 ceph_cap_string(have), ceph_cap_string(need));
2012 }
2013out:
2014 spin_unlock(&inode->i_lock);
2015 dout("get_cap_refs %p ret %d got %s\n", inode,
2016 ret, ceph_cap_string(*got));
2017 return ret;
2018}
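
/*
 * Typical need/want split (illustrative): a buffered write needs
 * CEPH_CAP_FILE_WR but merely wants CEPH_CAP_FILE_BUFFER, so
 * *got = need | (have & want) lets the write degrade to sync I/O
 * when the BUFFER cap isn't issued.
 */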
2019
2020/*
2021 * Check the offset we are writing up to against our current
2022 * max_size. If necessary, tell the MDS we want to write to
2023 * a larger offset.
2024 */
2025static void check_max_size(struct inode *inode, loff_t endoff)
2026{
2027 struct ceph_inode_info *ci = ceph_inode(inode);
2028 int check = 0;
2029
2030 /* do we need to explicitly request a larger max_size? */
2031 spin_lock(&inode->i_lock);
2032 if ((endoff >= ci->i_max_size ||
2033 endoff > (inode->i_size << 1)) &&
2034 endoff > ci->i_wanted_max_size) {
2035 dout("write %p at large endoff %llu, req max_size\n",
2036 inode, endoff);
2037 ci->i_wanted_max_size = endoff;
2038 check = 1;
2039 }
2040 spin_unlock(&inode->i_lock);
2041 if (check)
2042 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2043}
2044
2045/*
2046 * Wait for caps, and take cap references. If we can't get a WR cap
2047 * due to a small max_size, make sure we check_max_size (and possibly
2048 * ask the mds) so we don't get hung up indefinitely.
2049 */
2050int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got,
2051 loff_t endoff)
2052{
2053 int check_max, ret, err;
2054
2055retry:
2056 if (endoff > 0)
2057 check_max_size(&ci->vfs_inode, endoff);
2058 check_max = 0;
2059 err = 0;
2060 ret = wait_event_interruptible(ci->i_cap_wq,
2061 try_get_cap_refs(ci, need, want,
2062 got, endoff,
2063 &check_max, &err));
2064 if (err)
2065 ret = err;
2066 if (check_max)
2067 goto retry;
2068 return ret;
2069}
2070
2071/*
2072 * Take cap refs. Caller must already know we hold at least one ref
2073 * on the caps in question or we don't know this is safe.
2074 */
2075void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
2076{
2077 spin_lock(&ci->vfs_inode.i_lock);
2078 __take_cap_refs(ci, caps);
2079 spin_unlock(&ci->vfs_inode.i_lock);
2080}
2081
2082/*
2083 * Release cap refs.
2084 *
2085 * If we released the last ref on any given cap, call ceph_check_caps
2086 * to release (or schedule a release).
2087 *
2088 * If we are releasing a WR cap (from a sync write), finalize any affected
2089 * cap_snap, and wake up any waiters.
2090 */
2091void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
2092{
2093 struct inode *inode = &ci->vfs_inode;
2094 int last = 0, put = 0, flushsnaps = 0, wake = 0;
2095 struct ceph_cap_snap *capsnap;
2096
2097 spin_lock(&inode->i_lock);
2098 if (had & CEPH_CAP_PIN)
2099 --ci->i_pin_ref;
2100 if (had & CEPH_CAP_FILE_RD)
2101 if (--ci->i_rd_ref == 0)
2102 last++;
2103 if (had & CEPH_CAP_FILE_CACHE)
2104 if (--ci->i_rdcache_ref == 0)
2105 last++;
2106 if (had & CEPH_CAP_FILE_BUFFER) {
2107 if (--ci->i_wrbuffer_ref == 0) {
2108 last++;
2109 put++;
2110 }
2111 dout("put_cap_refs %p wrbuffer %d -> %d (?)\n",
2112 inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref);
2113 }
2114 if (had & CEPH_CAP_FILE_WR)
2115 if (--ci->i_wr_ref == 0) {
2116 last++;
2117 if (!list_empty(&ci->i_cap_snaps)) {
2118 capsnap = list_first_entry(&ci->i_cap_snaps,
2119 struct ceph_cap_snap,
2120 ci_item);
2121 if (capsnap->writing) {
2122 capsnap->writing = 0;
2123 flushsnaps =
2124 __ceph_finish_cap_snap(ci,
2125 capsnap);
2126 wake = 1;
2127 }
2128 }
2129 }
2130 spin_unlock(&inode->i_lock);
2131
2132 dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
2133 last ? " last" : "", put ? " put" : "");
2134
2135 if (last && !flushsnaps)
2136 ceph_check_caps(ci, 0, NULL);
2137 else if (flushsnaps)
2138 ceph_flush_snaps(ci);
2139 if (wake)
2140 wake_up(&ci->i_cap_wq);
2141 if (put)
2142 iput(inode);
2143}
2144
2145/*
2146 * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
2147 * context. Adjust per-snap dirty page accounting as appropriate.
2148 * Once all dirty data for a cap_snap is flushed, flush snapped file
2149 * metadata back to the MDS. If we dropped the last ref, call
2150 * ceph_check_caps.
2151 */
2152void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2153 struct ceph_snap_context *snapc)
2154{
2155 struct inode *inode = &ci->vfs_inode;
2156 int last = 0;
2157 int complete_capsnap = 0;
2158 int drop_capsnap = 0;
2159 int found = 0;
2160 struct ceph_cap_snap *capsnap = NULL;
2161
2162 spin_lock(&inode->i_lock);
2163 ci->i_wrbuffer_ref -= nr;
2164 last = !ci->i_wrbuffer_ref;
2165
2166 if (ci->i_head_snapc == snapc) {
2167 ci->i_wrbuffer_ref_head -= nr;
2168 if (!ci->i_wrbuffer_ref_head) {
2169 ceph_put_snap_context(ci->i_head_snapc);
2170 ci->i_head_snapc = NULL;
2171 }
2172 dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
2173 inode,
2174 ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
2175 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
2176 last ? " LAST" : "");
2177 } else {
2178 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2179 if (capsnap->context == snapc) {
2180 found = 1;
2181 break;
2182 }
2183 }
2184 BUG_ON(!found);
2185 capsnap->dirty_pages -= nr;
2186 if (capsnap->dirty_pages == 0) {
2187 complete_capsnap = 1;
2188 if (capsnap->dirty == 0)
2189 /* cap writeback completed before we created
2190 * the cap_snap; no FLUSHSNAP is needed */
2191 drop_capsnap = 1;
2192 }
2193 dout("put_wrbuffer_cap_refs on %p cap_snap %p "
2194 " snap %lld %d/%d -> %d/%d %s%s%s\n",
2195 inode, capsnap, capsnap->context->seq,
2196 ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
2197 ci->i_wrbuffer_ref, capsnap->dirty_pages,
2198 last ? " (wrbuffer last)" : "",
2199 complete_capsnap ? " (complete capsnap)" : "",
2200 drop_capsnap ? " (drop capsnap)" : "");
2201 if (drop_capsnap) {
2202 ceph_put_snap_context(capsnap->context);
2203 list_del(&capsnap->ci_item);
2204 list_del(&capsnap->flushing_item);
2205 ceph_put_cap_snap(capsnap);
2206 }
2207 }
2208
2209 spin_unlock(&inode->i_lock);
2210
2211 if (last) {
2212 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2213 iput(inode);
2214 } else if (complete_capsnap) {
2215 ceph_flush_snaps(ci);
2216 wake_up(&ci->i_cap_wq);
2217 }
2218 if (drop_capsnap)
2219 iput(inode);
2220}
2221
2222/*
2223 * Handle a cap GRANT message from the MDS. (Note that a GRANT may
2224 * actually be a revocation if it specifies a smaller cap set.)
2225 *
2226 * caller holds s_mutex and i_lock, we drop both.
2227 *
2228 * check_caps disposition (the function returns void):
2229 * 0 - ok
2230 * 1 - check_caps on auth cap only (writeback)
2231 * 2 - check_caps (ack revoke)
2232 */
2233static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2234 struct ceph_mds_session *session,
2235 struct ceph_cap *cap,
2236 struct ceph_buffer *xattr_buf)
2237 __releases(inode->i_lock)
2238 __releases(session->s_mutex)
2239{
2240 struct ceph_inode_info *ci = ceph_inode(inode);
2241 int mds = session->s_mds;
2242 int seq = le32_to_cpu(grant->seq);
2243 int newcaps = le32_to_cpu(grant->caps);
2244 int issued, implemented, used, wanted, dirty;
2245 u64 size = le64_to_cpu(grant->size);
2246 u64 max_size = le64_to_cpu(grant->max_size);
2247 struct timespec mtime, atime, ctime;
2248 int check_caps = 0;
2249 int wake = 0;
2250 int writeback = 0;
2251 int revoked_rdcache = 0;
2252 int queue_invalidate = 0;
2253
2254 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
2255 inode, cap, mds, seq, ceph_cap_string(newcaps));
2256 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
2257 inode->i_size);
2258
2259 /*
2260 * If CACHE is being revoked, and we have no dirty buffers,
2261 * try to invalidate (once). (If there are dirty buffers, we
2262 * will invalidate _after_ writeback.)
2263 */
2264 if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
2265 !ci->i_wrbuffer_ref) {
2266 if (try_nonblocking_invalidate(inode) == 0) {
2267 revoked_rdcache = 1;
2268 } else {
2269 /* there were locked pages.. invalidate later
2270 in a separate thread. */
2271 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
2272 queue_invalidate = 1;
2273 ci->i_rdcache_revoking = ci->i_rdcache_gen;
2274 }
2275 }
2276 }
2277
2278 /* side effects now are allowed */
2279
2280 issued = __ceph_caps_issued(ci, &implemented);
2281 issued |= implemented | __ceph_caps_dirty(ci);
2282
2283 cap->cap_gen = session->s_cap_gen;
2284
2285 __check_cap_issue(ci, cap, newcaps);
2286
2287 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
2288 inode->i_mode = le32_to_cpu(grant->mode);
2289 inode->i_uid = le32_to_cpu(grant->uid);
2290 inode->i_gid = le32_to_cpu(grant->gid);
2291 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
2292 inode->i_uid, inode->i_gid);
2293 }
2294
2295 if ((issued & CEPH_CAP_LINK_EXCL) == 0)
2296 inode->i_nlink = le32_to_cpu(grant->nlink);
2297
2298 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
2299 int len = le32_to_cpu(grant->xattr_len);
2300 u64 version = le64_to_cpu(grant->xattr_version);
2301
2302 if (version > ci->i_xattrs.version) {
2303 dout(" got new xattrs v%llu on %p len %d\n",
2304 version, inode, len);
2305 if (ci->i_xattrs.blob)
2306 ceph_buffer_put(ci->i_xattrs.blob);
2307 ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
2308 ci->i_xattrs.version = version;
2309 }
2310 }
2311
2312 /* size/ctime/mtime/atime? */
2313 ceph_fill_file_size(inode, issued,
2314 le32_to_cpu(grant->truncate_seq),
2315 le64_to_cpu(grant->truncate_size), size);
2316 ceph_decode_timespec(&mtime, &grant->mtime);
2317 ceph_decode_timespec(&atime, &grant->atime);
2318 ceph_decode_timespec(&ctime, &grant->ctime);
2319 ceph_fill_file_time(inode, issued,
2320 le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
2321 &atime);
2322
2323 /* max size increase? */
2324 if (max_size != ci->i_max_size) {
2325 dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
2326 ci->i_max_size = max_size;
2327 if (max_size >= ci->i_wanted_max_size) {
2328 ci->i_wanted_max_size = 0; /* reset */
2329 ci->i_requested_max_size = 0;
2330 }
2331 wake = 1;
2332 }
2333
2334 /* check cap bits */
2335 wanted = __ceph_caps_wanted(ci);
2336 used = __ceph_caps_used(ci);
2337 dirty = __ceph_caps_dirty(ci);
2338 dout(" my wanted = %s, used = %s, dirty %s\n",
2339 ceph_cap_string(wanted),
2340 ceph_cap_string(used),
2341 ceph_cap_string(dirty));
2342 if (wanted != le32_to_cpu(grant->wanted)) {
2343 dout("mds wanted %s -> %s\n",
2344 ceph_cap_string(le32_to_cpu(grant->wanted)),
2345 ceph_cap_string(wanted));
2346 grant->wanted = cpu_to_le32(wanted);
2347 }
2348
2349 cap->seq = seq;
2350
2351 /* file layout may have changed */
2352 ci->i_layout = grant->layout;
2353
2354 /* revocation, grant, or no-op? */
2355 if (cap->issued & ~newcaps) {
2356 dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued),
2357 ceph_cap_string(newcaps));
2358 if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
2359 writeback = 1; /* will delay ack */
2360 else if (dirty & ~newcaps)
2361 check_caps = 1; /* initiate writeback in check_caps */
2362 else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
2363 revoked_rdcache)
2364 check_caps = 2; /* send revoke ack in check_caps */
2365 cap->issued = newcaps;
2366 cap->implemented |= newcaps;
2367 } else if (cap->issued == newcaps) {
2368 dout("caps unchanged: %s -> %s\n",
2369 ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
2370 } else {
2371 dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
2372 ceph_cap_string(newcaps));
2373 cap->issued = newcaps;
2374 cap->implemented |= newcaps; /* add bits only, to
2375 * avoid stepping on a
2376 * pending revocation */
2377 wake = 1;
2378 }
2379 BUG_ON(cap->issued & ~cap->implemented);
2380
2381 spin_unlock(&inode->i_lock);
2382 if (writeback)
2383 /*
2384 * queue inode for writeback: we can't actually call
2385 * filemap_write_and_wait, etc. from message handler
2386 * context.
2387 */
2388 ceph_queue_writeback(inode);
2389 if (queue_invalidate)
2390 ceph_queue_invalidate(inode);
2391 if (wake)
2392 wake_up(&ci->i_cap_wq);
2393
2394 if (check_caps == 1)
2395 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
2396 session);
2397 else if (check_caps == 2)
2398 ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
2399 else
2400 mutex_unlock(&session->s_mutex);
2401}
2402
2403/*
2404 * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
2405 * MDS has been safely committed.
2406 */
2407static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2408 struct ceph_mds_caps *m,
2409 struct ceph_mds_session *session,
2410 struct ceph_cap *cap)
2411 __releases(inode->i_lock)
2412{
2413 struct ceph_inode_info *ci = ceph_inode(inode);
2414 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
2415 unsigned seq = le32_to_cpu(m->seq);
2416 int dirty = le32_to_cpu(m->dirty);
2417 int cleaned = 0;
2418 int drop = 0;
2419 int i;
2420
2421 for (i = 0; i < CEPH_CAP_BITS; i++)
2422 if ((dirty & (1 << i)) &&
2423 flush_tid == ci->i_cap_flush_tid[i])
2424 cleaned |= 1 << i;
2425
2426 dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
2427 " flushing %s -> %s\n",
2428 inode, session->s_mds, seq, ceph_cap_string(dirty),
2429 ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
2430 ceph_cap_string(ci->i_flushing_caps & ~cleaned));
2431
2432 if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
2433 goto out;
2434
2435 ci->i_flushing_caps &= ~cleaned;
2436
2437 spin_lock(&mdsc->cap_dirty_lock);
2438 if (ci->i_flushing_caps == 0) {
2439 list_del_init(&ci->i_flushing_item);
2440 if (!list_empty(&session->s_cap_flushing))
2441 dout(" mds%d still flushing cap on %p\n",
2442 session->s_mds,
2443 &list_entry(session->s_cap_flushing.next,
2444 struct ceph_inode_info,
2445 i_flushing_item)->vfs_inode);
2446 mdsc->num_cap_flushing--;
2447 wake_up(&mdsc->cap_flushing_wq);
2448 dout(" inode %p now !flushing\n", inode);
2449
2450 if (ci->i_dirty_caps == 0) {
2451 dout(" inode %p now clean\n", inode);
2452 BUG_ON(!list_empty(&ci->i_dirty_item));
2453 drop = 1;
2454 } else {
2455 BUG_ON(list_empty(&ci->i_dirty_item));
2456 }
2457 }
2458 spin_unlock(&mdsc->cap_dirty_lock);
2459 wake_up(&ci->i_cap_wq);
2460
2461out:
2462 spin_unlock(&inode->i_lock);
2463 if (drop)
2464 iput(inode);
2465}
2466
2467/*
2468 * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
2469 * throw away our cap_snap.
2470 *
2471 * Caller holds s_mutex.
2472 */
2473static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
2474 struct ceph_mds_caps *m,
2475 struct ceph_mds_session *session)
2476{
2477 struct ceph_inode_info *ci = ceph_inode(inode);
2478 u64 follows = le64_to_cpu(m->snap_follows);
2479 struct ceph_cap_snap *capsnap;
2480 int drop = 0;
2481
2482 dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
2483 inode, ci, session->s_mds, follows);
2484
2485 spin_lock(&inode->i_lock);
2486 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2487 if (capsnap->follows == follows) {
2488 if (capsnap->flush_tid != flush_tid) {
2489 dout(" cap_snap %p follows %lld tid %lld !="
2490 " %lld\n", capsnap, follows,
2491 flush_tid, capsnap->flush_tid);
2492 break;
2493 }
2494 WARN_ON(capsnap->dirty_pages || capsnap->writing);
2495 dout(" removing %p cap_snap %p follows %lld\n",
2496 inode, capsnap, follows);
2497 ceph_put_snap_context(capsnap->context);
2498 list_del(&capsnap->ci_item);
2499 list_del(&capsnap->flushing_item);
2500 ceph_put_cap_snap(capsnap);
2501 drop = 1;
2502 break;
2503 } else {
2504 dout(" skipping cap_snap %p follows %lld\n",
2505 capsnap, capsnap->follows);
2506 }
2507 }
2508 spin_unlock(&inode->i_lock);
2509 if (drop)
2510 iput(inode);
2511}
2512
2513/*
2514 * Handle TRUNC from MDS, indicating file truncation.
2515 *
2516 * caller holds s_mutex.
2517 */
2518static void handle_cap_trunc(struct inode *inode,
2519 struct ceph_mds_caps *trunc,
2520 struct ceph_mds_session *session)
2521 __releases(inode->i_lock)
2522{
2523 struct ceph_inode_info *ci = ceph_inode(inode);
2524 int mds = session->s_mds;
2525 int seq = le32_to_cpu(trunc->seq);
2526 u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
2527 u64 truncate_size = le64_to_cpu(trunc->truncate_size);
2528 u64 size = le64_to_cpu(trunc->size);
2529 int implemented = 0;
2530 int dirty = __ceph_caps_dirty(ci);
2531 int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
2532 int queue_trunc = 0;
2533
2534 issued |= implemented | dirty;
2535
2536 dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
2537 inode, mds, seq, truncate_size, truncate_seq);
2538 queue_trunc = ceph_fill_file_size(inode, issued,
2539 truncate_seq, truncate_size, size);
2540 spin_unlock(&inode->i_lock);
2541
2542 if (queue_trunc)
2543 ceph_queue_vmtruncate(inode);
2544}
2545
2546/*
2547 * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a
2548 * different one. If we are the most recent migration we've seen (as
2549 * indicated by mseq), make note of the migrating cap bits for the
2550 * duration (until we see the corresponding IMPORT).
2551 *
2552 * caller holds s_mutex
2553 */
2554static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2555 struct ceph_mds_session *session)
2556{
2557 struct ceph_inode_info *ci = ceph_inode(inode);
2558 int mds = session->s_mds;
2559 unsigned mseq = le32_to_cpu(ex->migrate_seq);
2560 struct ceph_cap *cap = NULL, *t;
2561 struct rb_node *p;
2562 int remember = 1;
2563
2564 dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
2565 inode, ci, mds, mseq);
2566
2567 spin_lock(&inode->i_lock);
2568
2569 /* make sure we haven't seen a higher mseq */
2570 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
2571 t = rb_entry(p, struct ceph_cap, ci_node);
2572 if (ceph_seq_cmp(t->mseq, mseq) > 0) {
2573 dout(" higher mseq on cap from mds%d\n",
2574 t->session->s_mds);
2575 remember = 0;
2576 }
2577 if (t->session->s_mds == mds)
2578 cap = t;
2579 }
2580
2581 if (cap) {
2582 if (remember) {
2583 /* make note */
2584 ci->i_cap_exporting_mds = mds;
2585 ci->i_cap_exporting_mseq = mseq;
2586 ci->i_cap_exporting_issued = cap->issued;
2587 }
2588 __ceph_remove_cap(cap);
2589 }
2590 /* else, we already released it */
2591
2592 spin_unlock(&inode->i_lock);
2593}
2594
2595/*
2596 * Handle cap IMPORT. If there are temp bits from an older EXPORT,
2597 * clean them up.
2598 *
2599 * caller holds s_mutex.
2600 */
2601static void handle_cap_import(struct ceph_mds_client *mdsc,
2602 struct inode *inode, struct ceph_mds_caps *im,
2603 struct ceph_mds_session *session,
2604 void *snaptrace, int snaptrace_len)
2605{
2606 struct ceph_inode_info *ci = ceph_inode(inode);
2607 int mds = session->s_mds;
2608 unsigned issued = le32_to_cpu(im->caps);
2609 unsigned wanted = le32_to_cpu(im->wanted);
2610 unsigned seq = le32_to_cpu(im->seq);
2611 unsigned mseq = le32_to_cpu(im->migrate_seq);
2612 u64 realmino = le64_to_cpu(im->realm);
2613 u64 cap_id = le64_to_cpu(im->cap_id);
2614
2615 if (ci->i_cap_exporting_mds >= 0 &&
2616 ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
2617 dout("handle_cap_import inode %p ci %p mds%d mseq %d"
2618 " - cleared exporting from mds%d\n",
2619 inode, ci, mds, mseq,
2620 ci->i_cap_exporting_mds);
2621 ci->i_cap_exporting_issued = 0;
2622 ci->i_cap_exporting_mseq = 0;
2623 ci->i_cap_exporting_mds = -1;
2624 } else {
2625 dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
2626 inode, ci, mds, mseq);
2627 }
2628
2629 down_write(&mdsc->snap_rwsem);
2630 ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
2631 false);
2632 downgrade_write(&mdsc->snap_rwsem);
2633 ceph_add_cap(inode, session, cap_id, -1,
2634 issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
2635 NULL /* no caps context */);
2636 try_flush_caps(inode, session, NULL);
2637 up_read(&mdsc->snap_rwsem);
2638}
2639
2640/*
2641 * Handle a caps message from the MDS.
2642 *
2643 * Identify the appropriate session, inode, and call the right handler
2644 * based on the cap op.
2645 */
2646void ceph_handle_caps(struct ceph_mds_session *session,
2647 struct ceph_msg *msg)
2648{
2649 struct ceph_mds_client *mdsc = session->s_mdsc;
2650 struct super_block *sb = mdsc->client->sb;
2651 struct inode *inode;
2652 struct ceph_cap *cap;
2653 struct ceph_mds_caps *h;
2654 int mds = session->s_mds;
2655 int op;
2656 u32 seq;
2657 struct ceph_vino vino;
2658 u64 cap_id;
2659 u64 size, max_size;
2660 u64 tid;
2661 void *snaptrace;
2662
2663 dout("handle_caps from mds%d\n", mds);
2664
2665 /* decode */
2666 tid = le64_to_cpu(msg->hdr.tid);
2667 if (msg->front.iov_len < sizeof(*h))
2668 goto bad;
2669 h = msg->front.iov_base;
2670 snaptrace = h + 1;
2671 op = le32_to_cpu(h->op);
2672 vino.ino = le64_to_cpu(h->ino);
2673 vino.snap = CEPH_NOSNAP;
2674 cap_id = le64_to_cpu(h->cap_id);
2675 seq = le32_to_cpu(h->seq);
2676 size = le64_to_cpu(h->size);
2677 max_size = le64_to_cpu(h->max_size);
2678
2679 mutex_lock(&session->s_mutex);
2680 session->s_seq++;
2681 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
2682 (unsigned)seq);
2683
2684 /* lookup ino */
2685 inode = ceph_find_inode(sb, vino);
2686 dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
2687 vino.snap, inode);
2688 if (!inode) {
2689 dout(" i don't have ino %llx\n", vino.ino);
2690 goto done;
2691 }
2692
2693 /* these will work even if we don't have a cap yet */
2694 switch (op) {
2695 case CEPH_CAP_OP_FLUSHSNAP_ACK:
2696 handle_cap_flushsnap_ack(inode, tid, h, session);
2697 goto done;
2698
2699 case CEPH_CAP_OP_EXPORT:
2700 handle_cap_export(inode, h, session);
2701 goto done;
2702
2703 case CEPH_CAP_OP_IMPORT:
2704 handle_cap_import(mdsc, inode, h, session,
2705 snaptrace, le32_to_cpu(h->snap_trace_len));
2706 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
2707 session);
2708 goto done_unlocked;
2709 }
2710
2711 /* the rest require a cap */
2712 spin_lock(&inode->i_lock);
2713 cap = __get_cap_for_mds(ceph_inode(inode), mds);
2714 if (!cap) {
2715 dout("no cap on %p ino %llx.%llx from mds%d, releasing\n",
2716 inode, ceph_ino(inode), ceph_snap(inode), mds);
2717 spin_unlock(&inode->i_lock);
2718 goto done;
2719 }
2720
2721 /* note that each of these drops i_lock for us */
2722 switch (op) {
2723 case CEPH_CAP_OP_REVOKE:
2724 case CEPH_CAP_OP_GRANT:
2725 handle_cap_grant(inode, h, session, cap, msg->middle);
2726 goto done_unlocked;
2727
2728 case CEPH_CAP_OP_FLUSH_ACK:
2729 handle_cap_flush_ack(inode, tid, h, session, cap);
2730 break;
2731
2732 case CEPH_CAP_OP_TRUNC:
2733 handle_cap_trunc(inode, h, session);
2734 break;
2735
2736 default:
2737 spin_unlock(&inode->i_lock);
2738 pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
2739 ceph_cap_op_name(op));
2740 }
2741
2742done:
2743 mutex_unlock(&session->s_mutex);
2744done_unlocked:
2745 if (inode)
2746 iput(inode);
2747 return;
2748
2749bad:
2750 pr_err("ceph_handle_caps: corrupt message\n");
2751 ceph_msg_dump(msg);
2752 return;
2753}
2754
2755/*
 2756 * Delayed work handler to process the end of the delayed cap release LRU list.
2757 */
2758void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
2759{
2760 struct ceph_inode_info *ci;
2761 int flags = CHECK_CAPS_NODELAY;
2762
2763 dout("check_delayed_caps\n");
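	/*
	 * note that cap_delay_lock is dropped before each
	 * ceph_check_caps() call below, since that call may block and
	 * take other locks.
	 */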
2764 while (1) {
2765 spin_lock(&mdsc->cap_delay_lock);
2766 if (list_empty(&mdsc->cap_delay_list))
2767 break;
2768 ci = list_first_entry(&mdsc->cap_delay_list,
2769 struct ceph_inode_info,
2770 i_cap_delay_list);
2771 if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
2772 time_before(jiffies, ci->i_hold_caps_max))
2773 break;
2774 list_del_init(&ci->i_cap_delay_list);
2775 spin_unlock(&mdsc->cap_delay_lock);
2776 dout("check_delayed_caps on %p\n", &ci->vfs_inode);
2777 ceph_check_caps(ci, flags, NULL);
2778 }
2779 spin_unlock(&mdsc->cap_delay_lock);
2780}
2781
2782/*
2783 * Flush all dirty caps to the mds
2784 */
2785void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
2786{
2787 struct ceph_inode_info *ci, *nci = NULL;
2788 struct inode *inode, *ninode = NULL;
2789 struct list_head *p, *n;
2790
2791 dout("flush_dirty_caps\n");
2792 spin_lock(&mdsc->cap_dirty_lock);
2793 list_for_each_safe(p, n, &mdsc->cap_dirty) {
2794 if (nci) {
2795 ci = nci;
2796 inode = ninode;
2797 ci->i_ceph_flags &= ~CEPH_I_NOFLUSH;
2798 dout("flush_dirty_caps inode %p (was next inode)\n",
2799 inode);
2800 } else {
2801 ci = list_entry(p, struct ceph_inode_info,
2802 i_dirty_item);
2803 inode = igrab(&ci->vfs_inode);
2804 BUG_ON(!inode);
2805 dout("flush_dirty_caps inode %p\n", inode);
2806 }
2807 if (n != &mdsc->cap_dirty) {
2808 nci = list_entry(n, struct ceph_inode_info,
2809 i_dirty_item);
2810 ninode = igrab(&nci->vfs_inode);
2811 BUG_ON(!ninode);
2812 nci->i_ceph_flags |= CEPH_I_NOFLUSH;
2813 dout("flush_dirty_caps next inode %p, noflush\n",
2814 ninode);
2815 } else {
2816 nci = NULL;
2817 ninode = NULL;
2818 }
2819 spin_unlock(&mdsc->cap_dirty_lock);
2820 if (inode) {
2821 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
2822 NULL);
2823 iput(inode);
2824 }
2825 spin_lock(&mdsc->cap_dirty_lock);
2826 }
2827 spin_unlock(&mdsc->cap_dirty_lock);
2828}
2829
2830/*
2831 * Drop open file reference. If we were the last open file,
2832 * we may need to release capabilities to the MDS (or schedule
2833 * their delayed release).
2834 */
2835void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
2836{
2837 struct inode *inode = &ci->vfs_inode;
2838 int last = 0;
2839
2840 spin_lock(&inode->i_lock);
2841 dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
2842 ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
2843 BUG_ON(ci->i_nr_by_mode[fmode] == 0);
2844 if (--ci->i_nr_by_mode[fmode] == 0)
2845 last++;
2846 spin_unlock(&inode->i_lock);
2847
2848 if (last && ci->i_vino.snap == CEPH_NOSNAP)
2849 ceph_check_caps(ci, 0, NULL);
2850}
2851
2852/*
2853 * Helpers for embedding cap and dentry lease releases into mds
2854 * requests.
2855 *
2856 * @force is used by dentry_release (below) to force inclusion of a
2857 * record for the directory inode, even when there aren't any caps to
2858 * drop.
2859 */
2860int ceph_encode_inode_release(void **p, struct inode *inode,
2861 int mds, int drop, int unless, int force)
2862{
2863 struct ceph_inode_info *ci = ceph_inode(inode);
2864 struct ceph_cap *cap;
2865 struct ceph_mds_request_release *rel = *p;
2866 int ret = 0;
2867 int used = 0;
2868
2869 spin_lock(&inode->i_lock);
2870 used = __ceph_caps_used(ci);
2871
2872 dout("encode_inode_release %p mds%d used %s drop %s unless %s\n", inode,
2873 mds, ceph_cap_string(used), ceph_cap_string(drop),
2874 ceph_cap_string(unless));
2875
2876 /* only drop unused caps */
2877 drop &= ~used;
2878
2879 cap = __get_cap_for_mds(ci, mds);
2880 if (cap && __cap_is_valid(cap)) {
2881 if (force ||
2882 ((cap->issued & drop) &&
2883 (cap->issued & unless) == 0)) {
2884 if ((cap->issued & drop) &&
2885 (cap->issued & unless) == 0) {
2886 dout("encode_inode_release %p cap %p %s -> "
2887 "%s\n", inode, cap,
2888 ceph_cap_string(cap->issued),
2889 ceph_cap_string(cap->issued & ~drop));
2890 cap->issued &= ~drop;
2891 cap->implemented &= ~drop;
2892 if (ci->i_ceph_flags & CEPH_I_NODELAY) {
2893 int wanted = __ceph_caps_wanted(ci);
2894 dout(" wanted %s -> %s (act %s)\n",
2895 ceph_cap_string(cap->mds_wanted),
2896 ceph_cap_string(cap->mds_wanted &
2897 ~wanted),
2898 ceph_cap_string(wanted));
2899 cap->mds_wanted &= wanted;
2900 }
2901 } else {
2902 dout("encode_inode_release %p cap %p %s"
2903 " (force)\n", inode, cap,
2904 ceph_cap_string(cap->issued));
2905 }
2906
2907 rel->ino = cpu_to_le64(ceph_ino(inode));
2908 rel->cap_id = cpu_to_le64(cap->cap_id);
2909 rel->seq = cpu_to_le32(cap->seq);
 2910 rel->issue_seq = cpu_to_le32(cap->issue_seq);
2911 rel->mseq = cpu_to_le32(cap->mseq);
2912 rel->caps = cpu_to_le32(cap->issued);
2913 rel->wanted = cpu_to_le32(cap->mds_wanted);
2914 rel->dname_len = 0;
2915 rel->dname_seq = 0;
2916 *p += sizeof(*rel);
2917 ret = 1;
2918 } else {
2919 dout("encode_inode_release %p cap %p %s\n",
2920 inode, cap, ceph_cap_string(cap->issued));
2921 }
2922 }
2923 spin_unlock(&inode->i_lock);
2924 return ret;
2925}
2926
2927int ceph_encode_dentry_release(void **p, struct dentry *dentry,
2928 int mds, int drop, int unless)
2929{
2930 struct inode *dir = dentry->d_parent->d_inode;
2931 struct ceph_mds_request_release *rel = *p;
2932 struct ceph_dentry_info *di = ceph_dentry(dentry);
2933 int force = 0;
2934 int ret;
2935
2936 /*
 2937 * force a record for the directory caps if we have a dentry lease.
2938 * this is racy (can't take i_lock and d_lock together), but it
2939 * doesn't have to be perfect; the mds will revoke anything we don't
2940 * release.
2941 */
2942 spin_lock(&dentry->d_lock);
2943 if (di->lease_session && di->lease_session->s_mds == mds)
2944 force = 1;
2945 spin_unlock(&dentry->d_lock);
2946
2947 ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
2948
2949 spin_lock(&dentry->d_lock);
2950 if (ret && di->lease_session && di->lease_session->s_mds == mds) {
2951 dout("encode_dentry_release %p mds%d seq %d\n",
2952 dentry, mds, (int)di->lease_seq);
2953 rel->dname_len = cpu_to_le32(dentry->d_name.len);
2954 memcpy(*p, dentry->d_name.name, dentry->d_name.len);
2955 *p += dentry->d_name.len;
2956 rel->dname_seq = cpu_to_le32(di->lease_seq);
2957 }
2958 spin_unlock(&dentry->d_lock);
2959 return ret;
2960}
diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h
new file mode 100644
index 000000000000..1818c2305610
--- /dev/null
+++ b/fs/ceph/ceph_debug.h
@@ -0,0 +1,37 @@
1#ifndef _FS_CEPH_DEBUG_H
2#define _FS_CEPH_DEBUG_H
3
4#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
5
6#ifdef CONFIG_CEPH_FS_PRETTYDEBUG
7
8/*
9 * wrap pr_debug to include a filename:lineno prefix on each line.
10 * this incurs some overhead (kernel size and execution time) due to
11 * the extra function call at each call site.
12 */
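/*
 * With the pr_fmt prefix above, dout("foo %d\n", 1) from line 2563 of
 * caps.c then prints roughly (illustrative output):
 *
 *   ceph:        caps.c:2563 : foo 1
 */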
13
14# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
15extern const char *ceph_file_part(const char *s, int len);
16# define dout(fmt, ...) \
17 pr_debug(" %12.12s:%-4d : " fmt, \
18 ceph_file_part(__FILE__, sizeof(__FILE__)), \
19 __LINE__, ##__VA_ARGS__)
20# else
21/* faux printk call just to see any compiler warnings. */
22# define dout(fmt, ...) do { \
23 if (0) \
24 printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
25 } while (0)
26# endif
27
28#else
29
30/*
31 * or, just wrap pr_debug
32 */
33# define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__)
34
35#endif
36
37#endif
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
new file mode 100644
index 000000000000..ab6cf35c4091
--- /dev/null
+++ b/fs/ceph/ceph_frag.c
@@ -0,0 +1,21 @@
1/*
2 * Ceph 'frag' type
3 */
4#include "types.h"
5
6int ceph_frag_compare(__u32 a, __u32 b)
7{
8 unsigned va = ceph_frag_value(a);
9 unsigned vb = ceph_frag_value(b);
10 if (va < vb)
11 return -1;
12 if (va > vb)
13 return 1;
14 va = ceph_frag_bits(a);
15 vb = ceph_frag_bits(b);
16 if (va < vb)
17 return -1;
18 if (va > vb)
19 return 1;
20 return 0;
21}
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
new file mode 100644
index 000000000000..793f50cb7c22
--- /dev/null
+++ b/fs/ceph/ceph_frag.h
@@ -0,0 +1,109 @@
1#ifndef _FS_CEPH_FRAG_H
2#define _FS_CEPH_FRAG_H
3
4/*
5 * "Frags" are a way to describe a subset of a 32-bit number space,
6 * using a mask and a value to match against that mask. Any given frag
7 * (subset of the number space) can be partitioned into 2^n sub-frags.
8 *
9 * Frags are encoded into a 32-bit word:
10 * 8 upper bits = "bits"
11 * 24 lower bits = "value"
12 * (We could go to 5+27 bits, but who cares.)
13 *
 14 * We use the _most_ significant bits of the 24-bit value. This makes
 15 * the values sort logically.
16 *
17 * Unfortunately, because the "bits" field is still in the high bits, we
18 * can't sort encoded frags numerically. However, it does allow you
19 * to feed encoded frags as values into frag_contains_value.
20 */
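/*
 * Worked example: with bits=2 the mask is 0xc00000, so the frag
 * ceph_frag_make(2, 0x800000) covers the values 0x800000..0xbfffff:
 *
 *   ceph_frag_contains_value(ceph_frag_make(2, 0x800000), 0x9abcde) == 1
 *   ceph_frag_contains_value(ceph_frag_make(2, 0x800000), 0x7fffff) == 0
 */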
21static inline __u32 ceph_frag_make(__u32 b, __u32 v)
22{
23 return (b << 24) |
24 (v & (0xffffffu << (24-b)) & 0xffffffu);
25}
26static inline __u32 ceph_frag_bits(__u32 f)
27{
28 return f >> 24;
29}
30static inline __u32 ceph_frag_value(__u32 f)
31{
32 return f & 0xffffffu;
33}
34static inline __u32 ceph_frag_mask(__u32 f)
35{
36 return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
37}
38static inline __u32 ceph_frag_mask_shift(__u32 f)
39{
40 return 24 - ceph_frag_bits(f);
41}
42
43static inline int ceph_frag_contains_value(__u32 f, __u32 v)
44{
45 return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
46}
47static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
48{
49 /* is sub as specific as us, and contained by us? */
50 return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
51 (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
52}
53
54static inline __u32 ceph_frag_parent(__u32 f)
55{
56 return ceph_frag_make(ceph_frag_bits(f) - 1,
57 ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
58}
59static inline int ceph_frag_is_left_child(__u32 f)
60{
61 return ceph_frag_bits(f) > 0 &&
62 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
63}
64static inline int ceph_frag_is_right_child(__u32 f)
65{
66 return ceph_frag_bits(f) > 0 &&
 67 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) != 0;
68}
69static inline __u32 ceph_frag_sibling(__u32 f)
70{
71 return ceph_frag_make(ceph_frag_bits(f),
72 ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
73}
74static inline __u32 ceph_frag_left_child(__u32 f)
75{
76 return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
77}
78static inline __u32 ceph_frag_right_child(__u32 f)
79{
80 return ceph_frag_make(ceph_frag_bits(f)+1,
81 ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
82}
83static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
84{
85 int newbits = ceph_frag_bits(f) + by;
86 return ceph_frag_make(newbits,
87 ceph_frag_value(f) | (i << (24 - newbits)));
88}
89static inline int ceph_frag_is_leftmost(__u32 f)
90{
91 return ceph_frag_value(f) == 0;
92}
93static inline int ceph_frag_is_rightmost(__u32 f)
94{
95 return ceph_frag_value(f) == ceph_frag_mask(f);
96}
97static inline __u32 ceph_frag_next(__u32 f)
98{
99 return ceph_frag_make(ceph_frag_bits(f),
100 ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
101}
102
103/*
104 * comparator to sort frags logically, as when traversing the
105 * number space in ascending order...
106 */
107int ceph_frag_compare(__u32 a, __u32 b);
108
109#endif
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
new file mode 100644
index 000000000000..79d76bc4303f
--- /dev/null
+++ b/fs/ceph/ceph_fs.c
@@ -0,0 +1,74 @@
1/*
2 * Some non-inline ceph helpers
3 */
4#include "types.h"
5
6/*
7 * return true if @layout appears to be valid
8 */
9int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
10{
11 __u32 su = le32_to_cpu(layout->fl_stripe_unit);
12 __u32 sc = le32_to_cpu(layout->fl_stripe_count);
13 __u32 os = le32_to_cpu(layout->fl_object_size);
14
15 /* stripe unit, object size must be non-zero, 64k increment */
16 if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
17 return 0;
18 if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
19 return 0;
20 /* object size must be a multiple of stripe unit */
21 if (os < su || os % su)
22 return 0;
23 /* stripe count must be non-zero */
24 if (!sc)
25 return 0;
26 return 1;
27}
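/*
 * For example, a layout with a 64k stripe unit, a stripe count of 1,
 * and 4MB (64 * 65536) objects passes every check above; a 4k stripe
 * unit would fail the 64k-increment test.
 */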
28
29
30int ceph_flags_to_mode(int flags)
31{
32#ifdef O_DIRECTORY /* fixme */
33 if ((flags & O_DIRECTORY) == O_DIRECTORY)
34 return CEPH_FILE_MODE_PIN;
35#endif
36#ifdef O_LAZY
37 if (flags & O_LAZY)
38 return CEPH_FILE_MODE_LAZY;
39#endif
40 if ((flags & O_APPEND) == O_APPEND)
41 flags |= O_WRONLY;
42
43 flags &= O_ACCMODE;
44 if ((flags & O_RDWR) == O_RDWR)
45 return CEPH_FILE_MODE_RDWR;
46 if ((flags & O_WRONLY) == O_WRONLY)
47 return CEPH_FILE_MODE_WR;
48 return CEPH_FILE_MODE_RD;
49}
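/*
 * For example, O_RDWR maps to CEPH_FILE_MODE_RDWR, and even
 * O_RDONLY|O_APPEND maps to CEPH_FILE_MODE_WR, since the O_APPEND
 * check above forces O_WRONLY before the access mode is examined.
 */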
50
51int ceph_caps_for_mode(int mode)
52{
53 switch (mode) {
54 case CEPH_FILE_MODE_PIN:
55 return CEPH_CAP_PIN;
56 case CEPH_FILE_MODE_RD:
57 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
58 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
59 case CEPH_FILE_MODE_RDWR:
60 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
61 CEPH_CAP_FILE_EXCL |
62 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE |
63 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
64 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
65 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
66 case CEPH_FILE_MODE_WR:
67 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
68 CEPH_CAP_FILE_EXCL |
69 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
70 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
71 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
72 }
73 return 0;
74}
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
new file mode 100644
index 000000000000..0c2241ef3653
--- /dev/null
+++ b/fs/ceph/ceph_fs.h
@@ -0,0 +1,650 @@
1/*
2 * ceph_fs.h - Ceph constants and data types to share between kernel and
3 * user space.
4 *
5 * Most types in this file are defined as little-endian, and are
6 * primarily intended to describe data structures that pass over the
7 * wire or that are stored on disk.
8 *
9 * LGPL2
10 */
11
12#ifndef _FS_CEPH_CEPH_FS_H
13#define _FS_CEPH_CEPH_FS_H
14
15#include "msgr.h"
16#include "rados.h"
17
18/*
19 * Ceph release version
20 */
21#define CEPH_VERSION_MAJOR 0
22#define CEPH_VERSION_MINOR 19
23#define CEPH_VERSION_PATCH 0
24
25#define _CEPH_STRINGIFY(x) #x
26#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x)
27#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \
28 "." CEPH_STRINGIFY(z)
29#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \
30 CEPH_VERSION_MINOR, CEPH_VERSION_PATCH)
31
32/*
 33 * subprotocol versions. when specific message types or high-level
 34 * protocols change, bump the affected components. we rev
35 * internal cluster protocols separately from the public,
36 * client-facing protocol.
37 */
38#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
39#define CEPH_MDS_PROTOCOL 9 /* cluster internal */
40#define CEPH_MON_PROTOCOL 5 /* cluster internal */
41#define CEPH_OSDC_PROTOCOL 24 /* server/client */
42#define CEPH_MDSC_PROTOCOL 32 /* server/client */
43#define CEPH_MONC_PROTOCOL 15 /* server/client */
44
45
46#define CEPH_INO_ROOT 1
47#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
48
49/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
50#define CEPH_MAX_MON 31
51
52
53/*
54 * feature bits
55 */
56#define CEPH_FEATURE_SUPPORTED 0
57#define CEPH_FEATURE_REQUIRED 0
58
59
60/*
61 * ceph_file_layout - describe data layout for a file/inode
62 */
63struct ceph_file_layout {
64 /* file -> object mapping */
65 __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
66 of page size. */
67 __le32 fl_stripe_count; /* over this many objects */
68 __le32 fl_object_size; /* until objects are this big, then move to
69 new objects */
70 __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */
71
72 /* pg -> disk layout */
73 __le32 fl_object_stripe_unit; /* for per-object parity, if any */
74
75 /* object -> pg layout */
76 __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
77 __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
78} __attribute__ ((packed));
79
80#define CEPH_MIN_STRIPE_UNIT 65536
81
82int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
83
84
85/* crypto algorithms */
86#define CEPH_CRYPTO_NONE 0x0
87#define CEPH_CRYPTO_AES 0x1
88
89/* security/authentication protocols */
90#define CEPH_AUTH_UNKNOWN 0x0
91#define CEPH_AUTH_NONE 0x1
92#define CEPH_AUTH_CEPHX 0x2
93
94
95/*********************************************
96 * message layer
97 */
98
99/*
100 * message types
101 */
102
103/* misc */
104#define CEPH_MSG_SHUTDOWN 1
105#define CEPH_MSG_PING 2
106
107/* client <-> monitor */
108#define CEPH_MSG_MON_MAP 4
109#define CEPH_MSG_MON_GET_MAP 5
110#define CEPH_MSG_STATFS 13
111#define CEPH_MSG_STATFS_REPLY 14
112#define CEPH_MSG_MON_SUBSCRIBE 15
113#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
114#define CEPH_MSG_AUTH 17
115#define CEPH_MSG_AUTH_REPLY 18
116
117/* client <-> mds */
118#define CEPH_MSG_MDS_MAP 21
119
120#define CEPH_MSG_CLIENT_SESSION 22
121#define CEPH_MSG_CLIENT_RECONNECT 23
122
123#define CEPH_MSG_CLIENT_REQUEST 24
124#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
125#define CEPH_MSG_CLIENT_REPLY 26
126#define CEPH_MSG_CLIENT_CAPS 0x310
127#define CEPH_MSG_CLIENT_LEASE 0x311
128#define CEPH_MSG_CLIENT_SNAP 0x312
129#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
130
131/* osd */
132#define CEPH_MSG_OSD_MAP 41
133#define CEPH_MSG_OSD_OP 42
134#define CEPH_MSG_OSD_OPREPLY 43
135
136struct ceph_mon_request_header {
137 __le64 have_version;
138 __le16 session_mon;
139 __le64 session_mon_tid;
140} __attribute__ ((packed));
141
142struct ceph_mon_statfs {
143 struct ceph_mon_request_header monhdr;
144 struct ceph_fsid fsid;
145} __attribute__ ((packed));
146
147struct ceph_statfs {
148 __le64 kb, kb_used, kb_avail;
149 __le64 num_objects;
150} __attribute__ ((packed));
151
152struct ceph_mon_statfs_reply {
153 struct ceph_fsid fsid;
154 __le64 version;
155 struct ceph_statfs st;
156} __attribute__ ((packed));
157
158struct ceph_osd_getmap {
159 struct ceph_mon_request_header monhdr;
160 struct ceph_fsid fsid;
161 __le32 start;
162} __attribute__ ((packed));
163
164struct ceph_mds_getmap {
165 struct ceph_mon_request_header monhdr;
166 struct ceph_fsid fsid;
167} __attribute__ ((packed));
168
169struct ceph_client_mount {
170 struct ceph_mon_request_header monhdr;
171} __attribute__ ((packed));
172
173struct ceph_mon_subscribe_item {
174 __le64 have_version; __le64 have;
175 __u8 onetime;
176} __attribute__ ((packed));
177
178struct ceph_mon_subscribe_ack {
179 __le32 duration; /* seconds */
180 struct ceph_fsid fsid;
181} __attribute__ ((packed));
182
183/*
184 * mds states
185 * > 0 -> in
186 * <= 0 -> out
187 */
188#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
189#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
190 empty log. */
191#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
192#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
193#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
194#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
195#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
196
197#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
198#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
199 operations (import, rename, etc.) */
200#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
201#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
202#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
203#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
204#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
205
206extern const char *ceph_mds_state_name(int s);
207
208
209/*
210 * metadata lock types.
 211 * - these are bitmasks; we can compose them
 212 * - they also define the lock ordering used by the MDS
213 * - a few of these are internal to the mds
214 */
215#define CEPH_LOCK_DN 1
216#define CEPH_LOCK_ISNAP 2
217#define CEPH_LOCK_IVERSION 4 /* mds internal */
218#define CEPH_LOCK_IFILE 8 /* mds internal */
219#define CEPH_LOCK_IAUTH 32
220#define CEPH_LOCK_ILINK 64
221#define CEPH_LOCK_IDFT 128 /* dir frag tree */
222#define CEPH_LOCK_INEST 256 /* mds internal */
223#define CEPH_LOCK_IXATTR 512
224#define CEPH_LOCK_INO 2048 /* immutable inode bits; not a lock */
225
226/* client_session ops */
227enum {
228 CEPH_SESSION_REQUEST_OPEN,
229 CEPH_SESSION_OPEN,
230 CEPH_SESSION_REQUEST_CLOSE,
231 CEPH_SESSION_CLOSE,
232 CEPH_SESSION_REQUEST_RENEWCAPS,
233 CEPH_SESSION_RENEWCAPS,
234 CEPH_SESSION_STALE,
235 CEPH_SESSION_RECALL_STATE,
236};
237
238extern const char *ceph_session_op_name(int op);
239
240struct ceph_mds_session_head {
241 __le32 op;
242 __le64 seq;
243 struct ceph_timespec stamp;
244 __le32 max_caps, max_leases;
245} __attribute__ ((packed));
246
247/* client_request */
248/*
249 * metadata ops.
250 * & 0x001000 -> write op
251 * & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
 252 * & 0x100000 -> use weird ino/path trace
253 */
254#define CEPH_MDS_OP_WRITE 0x001000
255enum {
256 CEPH_MDS_OP_LOOKUP = 0x00100,
257 CEPH_MDS_OP_GETATTR = 0x00101,
258 CEPH_MDS_OP_LOOKUPHASH = 0x00102,
259 CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
260
261 CEPH_MDS_OP_SETXATTR = 0x01105,
262 CEPH_MDS_OP_RMXATTR = 0x01106,
263 CEPH_MDS_OP_SETLAYOUT = 0x01107,
264 CEPH_MDS_OP_SETATTR = 0x01108,
265
266 CEPH_MDS_OP_MKNOD = 0x01201,
267 CEPH_MDS_OP_LINK = 0x01202,
268 CEPH_MDS_OP_UNLINK = 0x01203,
269 CEPH_MDS_OP_RENAME = 0x01204,
270 CEPH_MDS_OP_MKDIR = 0x01220,
271 CEPH_MDS_OP_RMDIR = 0x01221,
272 CEPH_MDS_OP_SYMLINK = 0x01222,
273
274 CEPH_MDS_OP_CREATE = 0x01301,
275 CEPH_MDS_OP_OPEN = 0x00302,
276 CEPH_MDS_OP_READDIR = 0x00305,
277
278 CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
279 CEPH_MDS_OP_MKSNAP = 0x01400,
280 CEPH_MDS_OP_RMSNAP = 0x01401,
281 CEPH_MDS_OP_LSSNAP = 0x00402,
282};
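/*
 * For example, CEPH_MDS_OP_MKDIR (0x01220) has the 0x001000 bit set
 * and is therefore a write op, while CEPH_MDS_OP_LOOKUP (0x00100)
 * does not:
 *
 *   (CEPH_MDS_OP_MKDIR & CEPH_MDS_OP_WRITE) != 0
 *   (CEPH_MDS_OP_LOOKUP & CEPH_MDS_OP_WRITE) == 0
 */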
283
284extern const char *ceph_mds_op_name(int op);
285
286
287#define CEPH_SETATTR_MODE 1
288#define CEPH_SETATTR_UID 2
289#define CEPH_SETATTR_GID 4
290#define CEPH_SETATTR_MTIME 8
291#define CEPH_SETATTR_ATIME 16
292#define CEPH_SETATTR_SIZE 32
293#define CEPH_SETATTR_CTIME 64
294
295union ceph_mds_request_args {
296 struct {
297 __le32 mask; /* CEPH_CAP_* */
298 } __attribute__ ((packed)) getattr;
299 struct {
300 __le32 mode;
301 __le32 uid;
302 __le32 gid;
303 struct ceph_timespec mtime;
304 struct ceph_timespec atime;
305 __le64 size, old_size; /* old_size needed by truncate */
306 __le32 mask; /* CEPH_SETATTR_* */
307 } __attribute__ ((packed)) setattr;
308 struct {
309 __le32 frag; /* which dir fragment */
310 __le32 max_entries; /* how many dentries to grab */
311 } __attribute__ ((packed)) readdir;
312 struct {
313 __le32 mode;
314 __le32 rdev;
315 } __attribute__ ((packed)) mknod;
316 struct {
317 __le32 mode;
318 } __attribute__ ((packed)) mkdir;
319 struct {
320 __le32 flags;
321 __le32 mode;
322 __le32 stripe_unit; /* layout for newly created file */
323 __le32 stripe_count; /* ... */
324 __le32 object_size;
325 __le32 file_replication;
326 __le32 preferred;
327 } __attribute__ ((packed)) open;
328 struct {
329 __le32 flags;
330 } __attribute__ ((packed)) setxattr;
331 struct {
332 struct ceph_file_layout layout;
333 } __attribute__ ((packed)) setlayout;
334} __attribute__ ((packed));
335
336#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
337#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
338
339struct ceph_mds_request_head {
340 __le64 oldest_client_tid;
341 __le32 mdsmap_epoch; /* on client */
342 __le32 flags; /* CEPH_MDS_FLAG_* */
343 __u8 num_retry, num_fwd; /* count retry, fwd attempts */
 344 __le16 num_releases; /* # of included cap/lease release records */
345 __le32 op; /* mds op code */
346 __le32 caller_uid, caller_gid;
347 __le64 ino; /* use this ino for openc, mkdir, mknod,
348 etc. (if replaying) */
349 union ceph_mds_request_args args;
350} __attribute__ ((packed));
351
352/* cap/lease release record */
353struct ceph_mds_request_release {
354 __le64 ino, cap_id; /* ino and unique cap id */
355 __le32 caps, wanted; /* new issued, wanted */
356 __le32 seq, issue_seq, mseq;
357 __le32 dname_seq; /* if releasing a dentry lease, a */
358 __le32 dname_len; /* string follows. */
359} __attribute__ ((packed));
360
361/* client reply */
362struct ceph_mds_reply_head {
363 __le32 op;
364 __le32 result;
365 __le32 mdsmap_epoch;
366 __u8 safe; /* true if committed to disk */
367 __u8 is_dentry, is_target; /* true if dentry, target inode records
368 are included with reply */
369} __attribute__ ((packed));
370
371/* one for each node split */
372struct ceph_frag_tree_split {
373 __le32 frag; /* this frag splits... */
374 __le32 by; /* ...by this many bits */
375} __attribute__ ((packed));
376
377struct ceph_frag_tree_head {
378 __le32 nsplits; /* num ceph_frag_tree_split records */
379 struct ceph_frag_tree_split splits[];
380} __attribute__ ((packed));
381
382/* capability issue, for bundling with mds reply */
383struct ceph_mds_reply_cap {
384 __le32 caps, wanted; /* caps issued, wanted */
385 __le64 cap_id;
386 __le32 seq, mseq;
387 __le64 realm; /* snap realm */
388 __u8 flags; /* CEPH_CAP_FLAG_* */
389} __attribute__ ((packed));
390
391#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */
392
393/* inode record, for bundling with mds reply */
394struct ceph_mds_reply_inode {
395 __le64 ino;
396 __le64 snapid;
397 __le32 rdev;
398 __le64 version; /* inode version */
399 __le64 xattr_version; /* version for xattr blob */
400 struct ceph_mds_reply_cap cap; /* caps issued for this inode */
401 struct ceph_file_layout layout;
402 struct ceph_timespec ctime, mtime, atime;
403 __le32 time_warp_seq;
404 __le64 size, max_size, truncate_size;
405 __le32 truncate_seq;
406 __le32 mode, uid, gid;
407 __le32 nlink;
408 __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */
409 struct ceph_timespec rctime;
410 struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */
411} __attribute__ ((packed));
412/* followed by frag array, then symlink string, then xattr blob */
413
414/* reply_lease follows dname, and reply_inode */
415struct ceph_mds_reply_lease {
416 __le16 mask; /* lease type(s) */
417 __le32 duration_ms; /* lease duration */
418 __le32 seq;
419} __attribute__ ((packed));
420
421struct ceph_mds_reply_dirfrag {
422 __le32 frag; /* fragment */
423 __le32 auth; /* auth mds, if this is a delegation point */
424 __le32 ndist; /* number of mds' this is replicated on */
425 __le32 dist[];
426} __attribute__ ((packed));
427
428/* file access modes */
429#define CEPH_FILE_MODE_PIN 0
430#define CEPH_FILE_MODE_RD 1
431#define CEPH_FILE_MODE_WR 2
432#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
433#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
 434#define CEPH_FILE_MODE_NUM 8 /* because these are bit fields, mostly */
435
436int ceph_flags_to_mode(int flags);
437
438
439/* capability bits */
440#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */
441
442/* generic cap bits */
 443#define CEPH_CAP_GSHARED 1 /* client can read */
444#define CEPH_CAP_GEXCL 2 /* client can read and update */
445#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */
446#define CEPH_CAP_GRD 8 /* (file) client can read */
447#define CEPH_CAP_GWR 16 /* (file) client can write */
448#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */
449#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
450#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
451
452/* per-lock shift */
453#define CEPH_CAP_SAUTH 2
454#define CEPH_CAP_SLINK 4
455#define CEPH_CAP_SXATTR 6
456#define CEPH_CAP_SFILE 8 /* goes at the end (uses >2 cap bits) */
457
458#define CEPH_CAP_BITS 16
459
460/* composed values */
461#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
462#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH)
463#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK)
464#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK)
465#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR)
466#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR)
467#define CEPH_CAP_FILE(x) (x << CEPH_CAP_SFILE)
468#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE)
469#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE)
470#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE)
471#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE)
472#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE)
473#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
474#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
475#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
476
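/*
 * Worked example of the composition above: CEPH_CAP_FILE_WR is
 * CEPH_CAP_GWR << CEPH_CAP_SFILE = 16 << 8 = 0x1000, and
 * CEPH_CAP_AUTH_SHARED is 1 << 2 = 0x4, so a cap word of 0x1004
 * means "may write file data and read the uid/gid/mode".
 */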
477/* cap masks (for getattr) */
478#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
479#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
480#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
481#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
482#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
483#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
484#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
485#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
486#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
487#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
488#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
489#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
490#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
491 CEPH_CAP_AUTH_SHARED | \
492 CEPH_CAP_LINK_SHARED | \
493 CEPH_CAP_FILE_SHARED | \
494 CEPH_CAP_XATTR_SHARED)
495
496#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
497 CEPH_CAP_LINK_SHARED | \
498 CEPH_CAP_XATTR_SHARED | \
499 CEPH_CAP_FILE_SHARED)
500#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
501 CEPH_CAP_FILE_CACHE)
502
503#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
504 CEPH_CAP_LINK_EXCL | \
505 CEPH_CAP_XATTR_EXCL | \
506 CEPH_CAP_FILE_EXCL)
507#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
508 CEPH_CAP_FILE_EXCL)
509#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
510#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
511 CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN)
512
513#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
514 CEPH_LOCK_IXATTR)
515
516int ceph_caps_for_mode(int mode);
517
518enum {
519 CEPH_CAP_OP_GRANT, /* mds->client grant */
520 CEPH_CAP_OP_REVOKE, /* mds->client revoke */
521 CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
522 CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
523 CEPH_CAP_OP_IMPORT, /* mds has imported the cap */
524 CEPH_CAP_OP_UPDATE, /* client->mds update */
525 CEPH_CAP_OP_DROP, /* client->mds drop cap bits */
526 CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */
527 CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */
528 CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
529 CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
530 CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
531 CEPH_CAP_OP_RENEW, /* client->mds renewal request */
532};
533
534extern const char *ceph_cap_op_name(int op);
535
536/*
537 * caps message, used for capability callbacks, acks, requests, etc.
538 */
539struct ceph_mds_caps {
540 __le32 op; /* CEPH_CAP_OP_* */
541 __le64 ino, realm;
542 __le64 cap_id;
543 __le32 seq, issue_seq;
544 __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
545 __le32 migrate_seq;
546 __le64 snap_follows;
547 __le32 snap_trace_len;
548
549 /* authlock */
550 __le32 uid, gid, mode;
551
552 /* linklock */
553 __le32 nlink;
554
555 /* xattrlock */
556 __le32 xattr_len;
557 __le64 xattr_version;
558
559 /* filelock */
560 __le64 size, max_size, truncate_size;
561 __le32 truncate_seq;
562 struct ceph_timespec mtime, atime, ctime;
563 struct ceph_file_layout layout;
564 __le32 time_warp_seq;
565} __attribute__ ((packed));
566
567/* cap release msg head */
568struct ceph_mds_cap_release {
569 __le32 num; /* number of cap_items that follow */
570} __attribute__ ((packed));
571
572struct ceph_mds_cap_item {
573 __le64 ino;
574 __le64 cap_id;
575 __le32 migrate_seq, seq;
576} __attribute__ ((packed));
577
578#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
579#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
580#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
581#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
582
583extern const char *ceph_lease_op_name(int o);
584
585/* lease msg header */
586struct ceph_mds_lease {
587 __u8 action; /* CEPH_MDS_LEASE_* */
588 __le16 mask; /* which lease */
589 __le64 ino;
590 __le64 first, last; /* snap range */
591 __le32 seq;
592 __le32 duration_ms; /* duration of renewal */
593} __attribute__ ((packed));
594/* followed by a __le32+string for dname */
595
596/* client reconnect */
597struct ceph_mds_cap_reconnect {
598 __le64 cap_id;
599 __le32 wanted;
600 __le32 issued;
601 __le64 size;
602 struct ceph_timespec mtime, atime;
603 __le64 snaprealm;
604 __le64 pathbase; /* base ino for our path to this ino */
605} __attribute__ ((packed));
606/* followed by encoded string */
607
608struct ceph_mds_snaprealm_reconnect {
609 __le64 ino; /* snap realm base */
610 __le64 seq; /* snap seq for this snap realm */
611 __le64 parent; /* parent realm */
612} __attribute__ ((packed));
613
614/*
615 * snaps
616 */
617enum {
618 CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */
619 CEPH_SNAP_OP_CREATE,
620 CEPH_SNAP_OP_DESTROY,
621 CEPH_SNAP_OP_SPLIT,
622};
623
624extern const char *ceph_snap_op_name(int o);
625
626/* snap msg header */
627struct ceph_mds_snap_head {
628 __le32 op; /* CEPH_SNAP_OP_* */
629 __le64 split; /* ino to split off, if any */
630 __le32 num_split_inos; /* # inos belonging to new child realm */
 631 __le32 num_split_realms; /* # child realms under new child realm */
632 __le32 trace_len; /* size of snap trace blob */
633} __attribute__ ((packed));
634/* followed by split ino list, then split realms, then the trace blob */
635
636/*
637 * encode info about a snaprealm, as viewed by a client
638 */
639struct ceph_mds_snap_realm {
640 __le64 ino; /* ino */
641 __le64 created; /* snap: when created */
642 __le64 parent; /* ino: parent realm */
643 __le64 parent_since; /* snap: same parent since */
644 __le64 seq; /* snap: version */
645 __le32 num_snaps;
646 __le32 num_prior_parent_snaps;
647} __attribute__ ((packed));
648/* followed by my snap list, then prior parent snap list */
649
650#endif
diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c
new file mode 100644
index 000000000000..bd570015d147
--- /dev/null
+++ b/fs/ceph/ceph_hash.c
@@ -0,0 +1,118 @@
1
2#include "types.h"
3
4/*
 5 * Robert Jenkins' hash function.
6 * http://burtleburtle.net/bob/hash/evahash.html
7 * This is in the public domain.
8 */
9#define mix(a, b, c) \
10 do { \
11 a = a - b; a = a - c; a = a ^ (c >> 13); \
12 b = b - c; b = b - a; b = b ^ (a << 8); \
13 c = c - a; c = c - b; c = c ^ (b >> 13); \
14 a = a - b; a = a - c; a = a ^ (c >> 12); \
15 b = b - c; b = b - a; b = b ^ (a << 16); \
16 c = c - a; c = c - b; c = c ^ (b >> 5); \
17 a = a - b; a = a - c; a = a ^ (c >> 3); \
18 b = b - c; b = b - a; b = b ^ (a << 10); \
19 c = c - a; c = c - b; c = c ^ (b >> 15); \
20 } while (0)
21
22unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
23{
24 const unsigned char *k = (const unsigned char *)str;
25 __u32 a, b, c; /* the internal state */
26 __u32 len; /* how many key bytes still need mixing */
27
28 /* Set up the internal state */
29 len = length;
30 a = 0x9e3779b9; /* the golden ratio; an arbitrary value */
31 b = a;
32 c = 0; /* variable initialization of internal state */
33
34 /* handle most of the key */
35 while (len >= 12) {
36 a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
37 ((__u32)k[3] << 24));
38 b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
39 ((__u32)k[7] << 24));
40 c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
41 ((__u32)k[11] << 24));
42 mix(a, b, c);
43 k = k + 12;
44 len = len - 12;
45 }
46
47 /* handle the last 11 bytes */
48 c = c + length;
49 switch (len) { /* all the case statements fall through */
50 case 11:
51 c = c + ((__u32)k[10] << 24);
52 case 10:
53 c = c + ((__u32)k[9] << 16);
54 case 9:
55 c = c + ((__u32)k[8] << 8);
56 /* the first byte of c is reserved for the length */
57 case 8:
58 b = b + ((__u32)k[7] << 24);
59 case 7:
60 b = b + ((__u32)k[6] << 16);
61 case 6:
62 b = b + ((__u32)k[5] << 8);
63 case 5:
64 b = b + k[4];
65 case 4:
66 a = a + ((__u32)k[3] << 24);
67 case 3:
68 a = a + ((__u32)k[2] << 16);
69 case 2:
70 a = a + ((__u32)k[1] << 8);
71 case 1:
72 a = a + k[0];
73 /* case 0: nothing left to add */
74 }
75 mix(a, b, c);
76
77 return c;
78}
79
80/*
81 * linux dcache hash
82 */
83unsigned ceph_str_hash_linux(const char *str, unsigned length)
84{
85 unsigned long hash = 0;
86 unsigned char c;
87
88 while (length--) {
89 c = *str++;
90 hash = (hash + (c << 4) + (c >> 4)) * 11;
91 }
92 return hash;
93}
94
95
96unsigned ceph_str_hash(int type, const char *s, unsigned len)
97{
98 switch (type) {
99 case CEPH_STR_HASH_LINUX:
100 return ceph_str_hash_linux(s, len);
101 case CEPH_STR_HASH_RJENKINS:
102 return ceph_str_hash_rjenkins(s, len);
103 default:
104 return -1;
105 }
106}
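/*
 * Hypothetical caller, for illustration only: a dentry name would be
 * hashed with whichever algorithm the server chose for the directory,
 * e.g.
 *
 *	unsigned h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, len);
 */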
107
108const char *ceph_str_hash_name(int type)
109{
110 switch (type) {
111 case CEPH_STR_HASH_LINUX:
112 return "linux";
113 case CEPH_STR_HASH_RJENKINS:
114 return "rjenkins";
115 default:
116 return "unknown";
117 }
118}
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
new file mode 100644
index 000000000000..5ac470c433c9
--- /dev/null
+++ b/fs/ceph/ceph_hash.h
@@ -0,0 +1,13 @@
1#ifndef _FS_CEPH_HASH_H
2#define _FS_CEPH_HASH_H
3
4#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
5#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
6
7extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
8extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
9
10extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
11extern const char *ceph_str_hash_name(int type);
12
13#endif
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
new file mode 100644
index 000000000000..8e4be6a80c62
--- /dev/null
+++ b/fs/ceph/ceph_strings.c
@@ -0,0 +1,176 @@
1/*
2 * Ceph string constants
3 */
4#include "types.h"
5
6const char *ceph_entity_type_name(int type)
7{
8 switch (type) {
9 case CEPH_ENTITY_TYPE_MDS: return "mds";
10 case CEPH_ENTITY_TYPE_OSD: return "osd";
11 case CEPH_ENTITY_TYPE_MON: return "mon";
12 case CEPH_ENTITY_TYPE_CLIENT: return "client";
13 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
14 case CEPH_ENTITY_TYPE_AUTH: return "auth";
15 default: return "unknown";
16 }
17}
18
19const char *ceph_osd_op_name(int op)
20{
21 switch (op) {
22 case CEPH_OSD_OP_READ: return "read";
23 case CEPH_OSD_OP_STAT: return "stat";
24
25 case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
26
27 case CEPH_OSD_OP_WRITE: return "write";
28 case CEPH_OSD_OP_DELETE: return "delete";
29 case CEPH_OSD_OP_TRUNCATE: return "truncate";
30 case CEPH_OSD_OP_ZERO: return "zero";
31 case CEPH_OSD_OP_WRITEFULL: return "writefull";
32
33 case CEPH_OSD_OP_APPEND: return "append";
34 case CEPH_OSD_OP_STARTSYNC: return "startsync";
35 case CEPH_OSD_OP_SETTRUNC: return "settrunc";
36 case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
37
38 case CEPH_OSD_OP_TMAPUP: return "tmapup";
39 case CEPH_OSD_OP_TMAPGET: return "tmapget";
40 case CEPH_OSD_OP_TMAPPUT: return "tmapput";
41
42 case CEPH_OSD_OP_GETXATTR: return "getxattr";
43 case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
44 case CEPH_OSD_OP_SETXATTR: return "setxattr";
45 case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
46 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
47 case CEPH_OSD_OP_RMXATTR: return "rmxattr";
48
49 case CEPH_OSD_OP_PULL: return "pull";
50 case CEPH_OSD_OP_PUSH: return "push";
51 case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
52 case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
53 case CEPH_OSD_OP_SCRUB: return "scrub";
54
55 case CEPH_OSD_OP_WRLOCK: return "wrlock";
56 case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
57 case CEPH_OSD_OP_RDLOCK: return "rdlock";
58 case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
59 case CEPH_OSD_OP_UPLOCK: return "uplock";
60 case CEPH_OSD_OP_DNLOCK: return "dnlock";
61
62 case CEPH_OSD_OP_CALL: return "call";
63
64 case CEPH_OSD_OP_PGLS: return "pgls";
65 }
66 return "???";
67}
68
69const char *ceph_mds_state_name(int s)
70{
71 switch (s) {
72 /* down and out */
73 case CEPH_MDS_STATE_DNE: return "down:dne";
74 case CEPH_MDS_STATE_STOPPED: return "down:stopped";
75 /* up and out */
76 case CEPH_MDS_STATE_BOOT: return "up:boot";
77 case CEPH_MDS_STATE_STANDBY: return "up:standby";
78 case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay";
79 case CEPH_MDS_STATE_CREATING: return "up:creating";
80 case CEPH_MDS_STATE_STARTING: return "up:starting";
81 /* up and in */
82 case CEPH_MDS_STATE_REPLAY: return "up:replay";
83 case CEPH_MDS_STATE_RESOLVE: return "up:resolve";
84 case CEPH_MDS_STATE_RECONNECT: return "up:reconnect";
85 case CEPH_MDS_STATE_REJOIN: return "up:rejoin";
86 case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
87 case CEPH_MDS_STATE_ACTIVE: return "up:active";
88 case CEPH_MDS_STATE_STOPPING: return "up:stopping";
89 }
90 return "???";
91}
92
93const char *ceph_session_op_name(int op)
94{
95 switch (op) {
96 case CEPH_SESSION_REQUEST_OPEN: return "request_open";
97 case CEPH_SESSION_OPEN: return "open";
98 case CEPH_SESSION_REQUEST_CLOSE: return "request_close";
99 case CEPH_SESSION_CLOSE: return "close";
100 case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps";
101 case CEPH_SESSION_RENEWCAPS: return "renewcaps";
102 case CEPH_SESSION_STALE: return "stale";
103 case CEPH_SESSION_RECALL_STATE: return "recall_state";
104 }
105 return "???";
106}
107
108const char *ceph_mds_op_name(int op)
109{
110 switch (op) {
111 case CEPH_MDS_OP_LOOKUP: return "lookup";
112 case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
113 case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
114 case CEPH_MDS_OP_GETATTR: return "getattr";
115 case CEPH_MDS_OP_SETXATTR: return "setxattr";
116 case CEPH_MDS_OP_SETATTR: return "setattr";
117 case CEPH_MDS_OP_RMXATTR: return "rmxattr";
118 case CEPH_MDS_OP_READDIR: return "readdir";
119 case CEPH_MDS_OP_MKNOD: return "mknod";
120 case CEPH_MDS_OP_LINK: return "link";
121 case CEPH_MDS_OP_UNLINK: return "unlink";
122 case CEPH_MDS_OP_RENAME: return "rename";
123 case CEPH_MDS_OP_MKDIR: return "mkdir";
124 case CEPH_MDS_OP_RMDIR: return "rmdir";
125 case CEPH_MDS_OP_SYMLINK: return "symlink";
126 case CEPH_MDS_OP_CREATE: return "create";
127 case CEPH_MDS_OP_OPEN: return "open";
128 case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
129 case CEPH_MDS_OP_LSSNAP: return "lssnap";
130 case CEPH_MDS_OP_MKSNAP: return "mksnap";
131 case CEPH_MDS_OP_RMSNAP: return "rmsnap";
132 }
133 return "???";
134}
135
136const char *ceph_cap_op_name(int op)
137{
138 switch (op) {
139 case CEPH_CAP_OP_GRANT: return "grant";
140 case CEPH_CAP_OP_REVOKE: return "revoke";
141 case CEPH_CAP_OP_TRUNC: return "trunc";
142 case CEPH_CAP_OP_EXPORT: return "export";
143 case CEPH_CAP_OP_IMPORT: return "import";
144 case CEPH_CAP_OP_UPDATE: return "update";
145 case CEPH_CAP_OP_DROP: return "drop";
146 case CEPH_CAP_OP_FLUSH: return "flush";
147 case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
148 case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
149 case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
150 case CEPH_CAP_OP_RELEASE: return "release";
151 case CEPH_CAP_OP_RENEW: return "renew";
152 }
153 return "???";
154}
155
156const char *ceph_lease_op_name(int o)
157{
158 switch (o) {
159 case CEPH_MDS_LEASE_REVOKE: return "revoke";
160 case CEPH_MDS_LEASE_RELEASE: return "release";
161 case CEPH_MDS_LEASE_RENEW: return "renew";
162 case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
163 }
164 return "???";
165}
166
167const char *ceph_snap_op_name(int o)
168{
169 switch (o) {
170 case CEPH_SNAP_OP_UPDATE: return "update";
171 case CEPH_SNAP_OP_CREATE: return "create";
172 case CEPH_SNAP_OP_DESTROY: return "destroy";
173 case CEPH_SNAP_OP_SPLIT: return "split";
174 }
175 return "???";
176}
diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c
new file mode 100644
index 000000000000..fabd302e5779
--- /dev/null
+++ b/fs/ceph/crush/crush.c
@@ -0,0 +1,151 @@
1
2#ifdef __KERNEL__
3# include <linux/slab.h>
4#else
5# include <stdlib.h>
6# include <assert.h>
7# define kfree(x) do { if (x) free(x); } while (0)
8# define BUG_ON(x) assert(!(x))
9#endif
10
11#include "crush.h"
12
13const char *crush_bucket_alg_name(int alg)
14{
15 switch (alg) {
16 case CRUSH_BUCKET_UNIFORM: return "uniform";
17 case CRUSH_BUCKET_LIST: return "list";
18 case CRUSH_BUCKET_TREE: return "tree";
19 case CRUSH_BUCKET_STRAW: return "straw";
20 default: return "unknown";
21 }
22}
23
24/**
25 * crush_get_bucket_item_weight - Get weight of an item in given bucket
26 * @b: bucket pointer
27 * @p: item index in bucket
28 */
29int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
30{
31 if (p >= b->size)
32 return 0;
33
34 switch (b->alg) {
35 case CRUSH_BUCKET_UNIFORM:
36 return ((struct crush_bucket_uniform *)b)->item_weight;
37 case CRUSH_BUCKET_LIST:
38 return ((struct crush_bucket_list *)b)->item_weights[p];
39 case CRUSH_BUCKET_TREE:
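		/*
		 * tree buckets keep item weights at the odd indices of
		 * node_weights[]; even indices are interior nodes.
		 */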
40 if (p & 1)
41 return ((struct crush_bucket_tree *)b)->node_weights[p];
42 return 0;
43 case CRUSH_BUCKET_STRAW:
44 return ((struct crush_bucket_straw *)b)->item_weights[p];
45 }
46 return 0;
47}
48
49/**
50 * crush_calc_parents - Calculate parent vectors for the given crush map.
51 * @map: crush_map pointer
52 */
53void crush_calc_parents(struct crush_map *map)
54{
55 int i, b, c;
56
57 for (b = 0; b < map->max_buckets; b++) {
58 if (map->buckets[b] == NULL)
59 continue;
60 for (i = 0; i < map->buckets[b]->size; i++) {
61 c = map->buckets[b]->items[i];
62 BUG_ON(c >= map->max_devices ||
63 c < -map->max_buckets);
64 if (c >= 0)
65 map->device_parents[c] = map->buckets[b]->id;
66 else
67 map->bucket_parents[-1-c] = map->buckets[b]->id;
68 }
69 }
70}
71
72void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
73{
74 kfree(b->h.perm);
75 kfree(b->h.items);
76 kfree(b);
77}
78
79void crush_destroy_bucket_list(struct crush_bucket_list *b)
80{
81 kfree(b->item_weights);
82 kfree(b->sum_weights);
83 kfree(b->h.perm);
84 kfree(b->h.items);
85 kfree(b);
86}
87
88void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
89{
90 kfree(b->node_weights);
91 kfree(b);
92}
93
94void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
95{
96 kfree(b->straws);
97 kfree(b->item_weights);
98 kfree(b->h.perm);
99 kfree(b->h.items);
100 kfree(b);
101}
102
103void crush_destroy_bucket(struct crush_bucket *b)
104{
105 switch (b->alg) {
106 case CRUSH_BUCKET_UNIFORM:
107 crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
108 break;
109 case CRUSH_BUCKET_LIST:
110 crush_destroy_bucket_list((struct crush_bucket_list *)b);
111 break;
112 case CRUSH_BUCKET_TREE:
113 crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
114 break;
115 case CRUSH_BUCKET_STRAW:
116 crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
117 break;
118 }
119}
120
121/**
122 * crush_destroy - Destroy a crush_map
123 * @map: crush_map pointer
124 */
125void crush_destroy(struct crush_map *map)
126{
127 int b;
128
129 /* buckets */
130 if (map->buckets) {
131 for (b = 0; b < map->max_buckets; b++) {
132 if (map->buckets[b] == NULL)
133 continue;
134 crush_destroy_bucket(map->buckets[b]);
135 }
136 kfree(map->buckets);
137 }
138
139 /* rules */
140 if (map->rules) {
141 for (b = 0; b < map->max_rules; b++)
142 kfree(map->rules[b]);
143 kfree(map->rules);
144 }
145
146 kfree(map->bucket_parents);
147 kfree(map->device_parents);
148 kfree(map);
149}
150
151
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
new file mode 100644
index 000000000000..dcd7e7523700
--- /dev/null
+++ b/fs/ceph/crush/crush.h
@@ -0,0 +1,180 @@
1#ifndef _CRUSH_CRUSH_H
2#define _CRUSH_CRUSH_H
3
4#include <linux/types.h>
5
6/*
7 * CRUSH is a pseudo-random data distribution algorithm that
8 * efficiently distributes input values (typically, data objects)
9 * across a heterogeneous, structured storage cluster.
10 *
11 * The algorithm was originally described in detail in this paper
12 * (although the algorithm has evolved somewhat since then):
13 *
14 * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
15 *
16 * LGPL2
17 */
18
19
20#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
21
22
23#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
24#define CRUSH_MAX_SET 10 /* max size of a mapping result */
25
26
27/*
28 * CRUSH uses user-defined "rules" to describe how inputs should be
 29 * mapped to devices. A rule consists of a sequence of steps to perform
30 * to generate the set of output devices.
31 */
32struct crush_rule_step {
33 __u32 op;
34 __s32 arg1;
35 __s32 arg2;
36};
37
38/* step op codes */
39enum {
40 CRUSH_RULE_NOOP = 0,
41 CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */
42 CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
43 /* arg2 = type */
44 CRUSH_RULE_CHOOSE_INDEP = 3, /* same */
45 CRUSH_RULE_EMIT = 4, /* no args */
46 CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
47 CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
48};
49
50/*
51 * for specifying choose num (arg1) relative to the max parameter
52 * passed to do_rule
53 */
54#define CRUSH_CHOOSE_N 0
55#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
56
57/*
58 * The rule mask is used to describe what the rule is intended for.
59 * Given a ruleset and size of output set, we search through the
60 * rule list for a matching rule_mask.
61 */
62struct crush_rule_mask {
63 __u8 ruleset;
64 __u8 type;
65 __u8 min_size;
66 __u8 max_size;
67};
68
69struct crush_rule {
70 __u32 len;
71 struct crush_rule_mask mask;
72 struct crush_rule_step steps[0];
73};
74
75#define crush_rule_size(len) (sizeof(struct crush_rule) + \
76 (len)*sizeof(struct crush_rule_step))
77
78
79
80/*
81 * A bucket is a named container of other items (either devices or
82 * other buckets). Items within a bucket are chosen using one of a
83 * few different algorithms. The table summarizes how the speed of
84 * each option measures up against mapping stability when items are
85 * added or removed.
86 *
87 * Bucket Alg Speed Additions Removals
88 * ------------------------------------------------
89 * uniform O(1) poor poor
90 * list O(n) optimal poor
91 * tree O(log n) good good
92 * straw O(n) optimal optimal
93 */
94enum {
95 CRUSH_BUCKET_UNIFORM = 1,
96 CRUSH_BUCKET_LIST = 2,
97 CRUSH_BUCKET_TREE = 3,
98 CRUSH_BUCKET_STRAW = 4
99};
100extern const char *crush_bucket_alg_name(int alg);
101
102struct crush_bucket {
103 __s32 id; /* this'll be negative */
104 __u16 type; /* non-zero; type=0 is reserved for devices */
105 __u8 alg; /* one of CRUSH_BUCKET_* */
106 __u8 hash; /* which hash function to use, CRUSH_HASH_* */
107 __u32 weight; /* 16-bit fixed point */
108 __u32 size; /* num items */
109 __s32 *items;
110
111 /*
 112 * cached random permutation: used for uniform buckets and for
113 * the linear search fallback for the other bucket types.
114 */
115 __u32 perm_x; /* @x for which *perm is defined */
116 __u32 perm_n; /* num elements of *perm that are permuted/defined */
117 __u32 *perm;
118};
119
120struct crush_bucket_uniform {
121 struct crush_bucket h;
122 __u32 item_weight; /* 16-bit fixed point; all items equally weighted */
123};
124
125struct crush_bucket_list {
126 struct crush_bucket h;
127 __u32 *item_weights; /* 16-bit fixed point */
128 __u32 *sum_weights; /* 16-bit fixed point. element i is sum
129 of weights 0..i, inclusive */
130};
131
132struct crush_bucket_tree {
133 struct crush_bucket h; /* note: h.size is _tree_ size, not number of
134 actual items */
135 __u8 num_nodes;
136 __u32 *node_weights;
137};
138
139struct crush_bucket_straw {
140 struct crush_bucket h;
141 __u32 *item_weights; /* 16-bit fixed point */
142 __u32 *straws; /* 16-bit fixed point */
143};
144
145
146
147/*
148 * CRUSH map includes all buckets, rules, etc.
149 */
150struct crush_map {
151 struct crush_bucket **buckets;
152 struct crush_rule **rules;
153
154 /*
155 * Parent pointers to identify the parent bucket a device or
156 * bucket in the hierarchy. If an item appears more than
157 * once, this is the _last_ time it appeared (where buckets
158 * are processed in bucket id order, from -1 on down to
159 * -max_buckets.
160 */
161 __u32 *bucket_parents;
162 __u32 *device_parents;
163
164 __s32 max_buckets;
165 __u32 max_rules;
166 __s32 max_devices;
167};
168
169
170/* crush.c */
171extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
172extern void crush_calc_parents(struct crush_map *map);
173extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
174extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
175extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
176extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
177extern void crush_destroy_bucket(struct crush_bucket *b);
178extern void crush_destroy(struct crush_map *map);
179
180#endif
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c
new file mode 100644
index 000000000000..5873aed694bf
--- /dev/null
+++ b/fs/ceph/crush/hash.c
@@ -0,0 +1,149 @@
1
2#include <linux/types.h>
3#include "hash.h"
4
5/*
6 * Robert Jenkins' function for mixing 32-bit values
7 * http://burtleburtle.net/bob/hash/evahash.html
8 * a, b = random bits, c = input and output
9 */
10#define crush_hashmix(a, b, c) do { \
11 a = a-b; a = a-c; a = a^(c>>13); \
12 b = b-c; b = b-a; b = b^(a<<8); \
13 c = c-a; c = c-b; c = c^(b>>13); \
14 a = a-b; a = a-c; a = a^(c>>12); \
15 b = b-c; b = b-a; b = b^(a<<16); \
16 c = c-a; c = c-b; c = c^(b>>5); \
17 a = a-b; a = a-c; a = a^(c>>3); \
18 b = b-c; b = b-a; b = b^(a<<10); \
19 c = c-a; c = c-b; c = c^(b>>15); \
20 } while (0)
21
22#define crush_hash_seed 1315423911
23
24static __u32 crush_hash32_rjenkins1(__u32 a)
25{
26 __u32 hash = crush_hash_seed ^ a;
27 __u32 b = a;
28 __u32 x = 231232;
29 __u32 y = 1232;
30 crush_hashmix(b, x, hash);
31 crush_hashmix(y, a, hash);
32 return hash;
33}
34
35static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
36{
37 __u32 hash = crush_hash_seed ^ a ^ b;
38 __u32 x = 231232;
39 __u32 y = 1232;
40 crush_hashmix(a, b, hash);
41 crush_hashmix(x, a, hash);
42 crush_hashmix(b, y, hash);
43 return hash;
44}
45
46static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
47{
48 __u32 hash = crush_hash_seed ^ a ^ b ^ c;
49 __u32 x = 231232;
50 __u32 y = 1232;
51 crush_hashmix(a, b, hash);
52 crush_hashmix(c, x, hash);
53 crush_hashmix(y, a, hash);
54 crush_hashmix(b, x, hash);
55 crush_hashmix(y, c, hash);
56 return hash;
57}
58
59static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
60{
61 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
62 __u32 x = 231232;
63 __u32 y = 1232;
64 crush_hashmix(a, b, hash);
65 crush_hashmix(c, d, hash);
66 crush_hashmix(a, x, hash);
67 crush_hashmix(y, b, hash);
68 crush_hashmix(c, x, hash);
69 crush_hashmix(y, d, hash);
70 return hash;
71}
72
73static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
74 __u32 e)
75{
76 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
77 __u32 x = 231232;
78 __u32 y = 1232;
79 crush_hashmix(a, b, hash);
80 crush_hashmix(c, d, hash);
81 crush_hashmix(e, x, hash);
82 crush_hashmix(y, a, hash);
83 crush_hashmix(b, x, hash);
84 crush_hashmix(y, c, hash);
85 crush_hashmix(d, x, hash);
86 crush_hashmix(y, e, hash);
87 return hash;
88}
89
90
91__u32 crush_hash32(int type, __u32 a)
92{
93 switch (type) {
94 case CRUSH_HASH_RJENKINS1:
95 return crush_hash32_rjenkins1(a);
96 default:
97 return 0;
98 }
99}
100
101__u32 crush_hash32_2(int type, __u32 a, __u32 b)
102{
103 switch (type) {
104 case CRUSH_HASH_RJENKINS1:
105 return crush_hash32_rjenkins1_2(a, b);
106 default:
107 return 0;
108 }
109}
110
111__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
112{
113 switch (type) {
114 case CRUSH_HASH_RJENKINS1:
115 return crush_hash32_rjenkins1_3(a, b, c);
116 default:
117 return 0;
118 }
119}
120
121__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
122{
123 switch (type) {
124 case CRUSH_HASH_RJENKINS1:
125 return crush_hash32_rjenkins1_4(a, b, c, d);
126 default:
127 return 0;
128 }
129}
130
131__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
132{
133 switch (type) {
134 case CRUSH_HASH_RJENKINS1:
135 return crush_hash32_rjenkins1_5(a, b, c, d, e);
136 default:
137 return 0;
138 }
139}
140
141const char *crush_hash_name(int type)
142{
143 switch (type) {
144 case CRUSH_HASH_RJENKINS1:
145 return "rjenkins1";
146 default:
147 return "unknown";
148 }
149}
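
To exercise the dispatch above from user space, the file compiles standalone in most environments, since the user-space linux/types.h already supplies __u32; the build line below is an assumption, not part of the patch. The sketch hashes the same (x, bucket id, r) triple the mapper feeds it:

/* Sketch: cc -o demo demo.c hash.c (linux/types.h supplies __u32 in user space). */
#include <stdio.h>
#include <linux/types.h>
#include "hash.h"

int main(void)
{
	__u32 h = crush_hash32_3(CRUSH_HASH_RJENKINS1, 12345, (__u32)-2, 0);

	printf("hash = %u, slot = %u of 4\n", h, h % 4);
	return 0;
}
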
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
new file mode 100644
index 000000000000..ff48e110e4bb
--- /dev/null
+++ b/fs/ceph/crush/hash.h
@@ -0,0 +1,17 @@
1#ifndef _CRUSH_HASH_H
2#define _CRUSH_HASH_H
3
4#define CRUSH_HASH_RJENKINS1 0
5
6#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
7
8extern const char *crush_hash_name(int type);
9
10extern __u32 crush_hash32(int type, __u32 a);
11extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
12extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
13extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
14extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
15 __u32 e);
16
17#endif
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
new file mode 100644
index 000000000000..9ba54efb6543
--- /dev/null
+++ b/fs/ceph/crush/mapper.c
@@ -0,0 +1,596 @@
1
2#ifdef __KERNEL__
3# include <linux/string.h>
4# include <linux/slab.h>
5# include <linux/bug.h>
6# include <linux/kernel.h>
7# ifndef dprintk
8# define dprintk(args...)
9# endif
10#else
11# include <string.h>
12# include <stdio.h>
13# include <stdlib.h>
14# include <assert.h>
15# define BUG_ON(x) assert(!(x))
16# define dprintk(args...) /* printf(args) */
17# define kmalloc(x, f) malloc(x)
18# define kfree(x) free(x)
19#endif
20
21#include "crush.h"
22#include "hash.h"
23
24/*
25 * Implement the core CRUSH mapping algorithm.
26 */
27
28/**
29 * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
30 * @map: the crush_map
31 * @ruleset: the storage ruleset id (user defined)
32 * @type: storage ruleset type (user defined)
33 * @size: output set size
34 */
35int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
36{
37 int i;
38
39 for (i = 0; i < map->max_rules; i++) {
40 if (map->rules[i] &&
41 map->rules[i]->mask.ruleset == ruleset &&
42 map->rules[i]->mask.type == type &&
43 map->rules[i]->mask.min_size <= size &&
44 map->rules[i]->mask.max_size >= size)
45 return i;
46 }
47 return -1;
48}
49
50
51/*
52 * bucket choose methods
53 *
54 * For each bucket algorithm, we have a "choose" method that, given a
55 * crush input @x and replica position (usually, position in output set) @r,
56 * will produce an item in the bucket.
57 */
58
59/*
60 * Choose based on a random permutation of the bucket.
61 *
62 * We used to use some prime number arithmetic to do this, but it
63 * wasn't very random, and had some other bad behaviors. Instead, we
64 * calculate an actual random permutation of the bucket members.
65 * Since this is expensive, we optimize for the r=0 case, which
66 * captures the vast majority of calls.
67 */
68static int bucket_perm_choose(struct crush_bucket *bucket,
69 int x, int r)
70{
71 unsigned pr = r % bucket->size;
72 unsigned i, s;
73
74 /* start a new permutation if @x has changed */
75 if (bucket->perm_x != x || bucket->perm_n == 0) {
76 dprintk("bucket %d new x=%d\n", bucket->id, x);
77 bucket->perm_x = x;
78
79 /* optimize common r=0 case */
80 if (pr == 0) {
81 s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
82 bucket->size;
83 bucket->perm[0] = s;
84 bucket->perm_n = 0xffff; /* magic value, see below */
85 goto out;
86 }
87
88 for (i = 0; i < bucket->size; i++)
89 bucket->perm[i] = i;
90 bucket->perm_n = 0;
91 } else if (bucket->perm_n == 0xffff) {
92 /* clean up after the r=0 case above */
93 for (i = 1; i < bucket->size; i++)
94 bucket->perm[i] = i;
95 bucket->perm[bucket->perm[0]] = 0;
96 bucket->perm_n = 1;
97 }
98
99 /* calculate permutation up to pr */
100 for (i = 0; i < bucket->perm_n; i++)
101 dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
102 while (bucket->perm_n <= pr) {
103 unsigned p = bucket->perm_n;
104 /* no point in swapping the final entry */
105 if (p < bucket->size - 1) {
106 i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
107 (bucket->size - p);
108 if (i) {
109 unsigned t = bucket->perm[p + i];
110 bucket->perm[p + i] = bucket->perm[p];
111 bucket->perm[p] = t;
112 }
113 dprintk(" perm_choose swap %d with %d\n", p, p+i);
114 }
115 bucket->perm_n++;
116 }
117 for (i = 0; i < bucket->size; i++)
118 dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
119
120 s = bucket->perm[pr];
121out:
122 dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
123 bucket->size, x, r, pr, s);
124 return bucket->items[s];
125}
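
The swap loop in bucket_perm_choose is a lazily evaluated Fisher-Yates shuffle keyed by the hash: position p is swapped with a hash-chosen entry from [p, size), and perm_n records how far the shuffle has progressed. For comparison, an eager version of the same shuffle might look like this (a sketch; hashfn stands in for crush_hash32_3 with the bucket's hash type and id bound):

#include <stdint.h>

/* Eager counterpart of the incremental shuffle above.  hashfn is any
 * deterministic 32-bit hash of (x, position); the kernel code uses
 * crush_hash32_3(bucket->hash, x, bucket->id, p). */
static void perm_shuffle(uint32_t *perm, unsigned size, uint32_t x,
			 uint32_t (*hashfn)(uint32_t, uint32_t))
{
	unsigned p, i;

	for (p = 0; p < size; p++)
		perm[p] = p;
	for (p = 0; p + 1 < size; p++) {
		i = hashfn(x, p) % (size - p);
		if (i) {
			uint32_t t = perm[p + i];

			perm[p + i] = perm[p];
			perm[p] = t;
		}
	}
}
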
126
127/* uniform */
128static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
129 int x, int r)
130{
131 return bucket_perm_choose(&bucket->h, x, r);
132}
133
134/* list */
135static int bucket_list_choose(struct crush_bucket_list *bucket,
136 int x, int r)
137{
138 int i;
139
140 for (i = bucket->h.size-1; i >= 0; i--) {
141 __u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
142 r, bucket->h.id);
143 w &= 0xffff;
144 dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
145 "sw %x rand %llx\n",
146 i, x, r, bucket->h.items[i], bucket->item_weights[i],
147 bucket->sum_weights[i], w);
148 w *= bucket->sum_weights[i];
149 w = w >> 16;
150 /*dprintk(" scaled %llx\n", w);*/
151 if (w < bucket->item_weights[i])
152 return bucket->h.items[i];
153 }
154
155 BUG_ON(1);
156 return 0;
157}
158
159
160/* (binary) tree */
161static int height(int n)
162{
163 int h = 0;
164 while ((n & 1) == 0) {
165 h++;
166 n = n >> 1;
167 }
168 return h;
169}
170
171static int left(int x)
172{
173 int h = height(x);
174 return x - (1 << (h-1));
175}
176
177static int right(int x)
178{
179 int h = height(x);
180 return x + (1 << (h-1));
181}
182
183static int terminal(int x)
184{
185 return x & 1;
186}
187
188static int bucket_tree_choose(struct crush_bucket_tree *bucket,
189 int x, int r)
190{
191 int n, l;
192 __u32 w;
193 __u64 t;
194
195 /* start at root */
196 n = bucket->num_nodes >> 1;
197
198 while (!terminal(n)) {
199 /* pick point in [0, w) */
200 w = bucket->node_weights[n];
201 t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
202 bucket->h.id) * (__u64)w;
203 t = t >> 32;
204
205 /* descend to the left or right? */
206 l = left(n);
207 if (t < bucket->node_weights[l])
208 n = l;
209 else
210 n = right(n);
211 }
212
213 return bucket->h.items[n >> 1];
214}
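
These helpers assume an in-order node numbering in which leaves occupy the odd indices and a node's height equals its count of trailing zero bits; the worked layout below (our reading of the scheme, for num_nodes = 8) shows how the descent lands on items[n >> 1]:

/* Worked layout for num_nodes = 8 (illustrative):
 *
 *              4            root = num_nodes >> 1, height(4) == 2
 *            /   \
 *           2     6         height 1; left(4) == 2, right(4) == 6
 *          / \   / \
 *         1   3 5   7       leaves: terminal(n) is true for odd n,
 *                           and leaf n maps to items[n >> 1], i.e. items[0..3]
 */
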
215
216
217/* straw */
218
219static int bucket_straw_choose(struct crush_bucket_straw *bucket,
220 int x, int r)
221{
222 int i;
223 int high = 0;
224 __u64 high_draw = 0;
225 __u64 draw;
226
227 for (i = 0; i < bucket->h.size; i++) {
228 draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
229 draw &= 0xffff;
230 draw *= bucket->straws[i];
231 if (i == 0 || draw > high_draw) {
232 high = i;
233 high_draw = draw;
234 }
235 }
236 return bucket->h.items[high];
237}
238
239static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
240{
241 dprintk("choose %d x=%d r=%d\n", in->id, x, r);
242 switch (in->alg) {
243 case CRUSH_BUCKET_UNIFORM:
244 return bucket_uniform_choose((struct crush_bucket_uniform *)in,
245 x, r);
246 case CRUSH_BUCKET_LIST:
247 return bucket_list_choose((struct crush_bucket_list *)in,
248 x, r);
249 case CRUSH_BUCKET_TREE:
250 return bucket_tree_choose((struct crush_bucket_tree *)in,
251 x, r);
252 case CRUSH_BUCKET_STRAW:
253 return bucket_straw_choose((struct crush_bucket_straw *)in,
254 x, r);
255 default:
256 BUG_ON(1);
257 return in->items[0];
258 }
259}
260
261/*
262 * true if device is marked "out" of the cluster
263 * (failed, fully offloaded)
264 */
265static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
266{
267 if (weight[item] >= 0x10000)
268 return 0;
269 if (weight[item] == 0)
270 return 1;
271 if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
272 < weight[item])
273 return 0;
274 return 1;
275}
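
Because weights are 16-bit fixed point, the hash test above keeps a device with probability weight/0x10000 for any given input x: a weight of 0x8000 passes roughly half of all inputs, 0x10000 always passes (first branch), and 0 always rejects. This is what lets a partially offloaded device shed a proportional fraction of its placements.
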
276
277/**
278 * crush_choose - choose numrep distinct items of given type
279 * @map: the crush_map
280 * @bucket: the bucket we are choosing an item from
281 * @x: crush input value
282 * @numrep: the number of items to choose
283 * @type: the type of item to choose
284 * @out: pointer to output vector
285 * @outpos: our position in that vector
286 * @firstn: true if choosing "first n" items, false if choosing "indep"
287 * @recurse_to_leaf: true if we want one device under each item of given type
288 * @out2: second output vector for leaf items (if @recurse_to_leaf)
289 */
290static int crush_choose(struct crush_map *map,
291 struct crush_bucket *bucket,
292 __u32 *weight,
293 int x, int numrep, int type,
294 int *out, int outpos,
295 int firstn, int recurse_to_leaf,
296 int *out2)
297{
298 int rep;
299 int ftotal, flocal;
300 int retry_descent, retry_bucket, skip_rep;
301 struct crush_bucket *in = bucket;
302 int r;
303 int i;
304 int item = 0;
305 int itemtype;
306 int collide, reject;
307 const int orig_tries = 5; /* attempts before we fall back to search */
308 dprintk("choose bucket %d x %d outpos %d\n", bucket->id, x, outpos);
309
310 for (rep = outpos; rep < numrep; rep++) {
311 /* keep trying until we get a non-out, non-colliding item */
312 ftotal = 0;
313 skip_rep = 0;
314 do {
315 retry_descent = 0;
316 in = bucket; /* initial bucket */
317
318 /* choose through intervening buckets */
319 flocal = 0;
320 do {
321 collide = 0;
322 retry_bucket = 0;
323 r = rep;
324 if (in->alg == CRUSH_BUCKET_UNIFORM) {
325 /* be careful */
326 if (firstn || numrep >= in->size)
327 /* r' = r + f_total */
328 r += ftotal;
329 else if (in->size % numrep == 0)
330 /* r'=r+(n+1)*f_local */
331 r += (numrep+1) *
332 (flocal+ftotal);
333 else
334 /* r' = r + n*f_local */
335 r += numrep * (flocal+ftotal);
336 } else {
337 if (firstn)
338 /* r' = r + f_total */
339 r += ftotal;
340 else
341 /* r' = r + n*f_local */
342 r += numrep * (flocal+ftotal);
343 }
344
345 /* bucket choose */
346 if (in->size == 0) {
347 reject = 1;
348 goto reject;
349 }
350 if (flocal >= (in->size>>1) &&
351 flocal > orig_tries)
352 item = bucket_perm_choose(in, x, r);
353 else
354 item = crush_bucket_choose(in, x, r);
355 BUG_ON(item >= map->max_devices);
356
357 /* desired type? */
358 if (item < 0)
359 itemtype = map->buckets[-1-item]->type;
360 else
361 itemtype = 0;
362 dprintk(" item %d type %d\n", item, itemtype);
363
364 /* keep going? */
365 if (itemtype != type) {
366 BUG_ON(item >= 0 ||
367 (-1-item) >= map->max_buckets);
368 in = map->buckets[-1-item];
369 continue;
370 }
371
372 /* collision? */
373 for (i = 0; i < outpos; i++) {
374 if (out[i] == item) {
375 collide = 1;
376 break;
377 }
378 }
379
380 if (recurse_to_leaf &&
381 item < 0 &&
382 crush_choose(map, map->buckets[-1-item],
383 weight,
384 x, outpos+1, 0,
385 out2, outpos,
386 firstn, 0, NULL) <= outpos) {
387 reject = 1;
388 } else {
389 /* out? */
390 if (itemtype == 0)
391 reject = is_out(map, weight,
392 item, x);
393 else
394 reject = 0;
395 }
396
397reject:
398 if (reject || collide) {
399 ftotal++;
400 flocal++;
401
402 if (collide && flocal < 3)
403 /* retry locally a few times */
404 retry_bucket = 1;
405 else if (flocal < in->size + orig_tries)
406 /* exhaustive bucket search */
407 retry_bucket = 1;
408 else if (ftotal < 20)
409 /* then retry descent */
410 retry_descent = 1;
411 else
412 /* else give up */
413 skip_rep = 1;
414 dprintk(" reject %d collide %d "
415 "ftotal %d flocal %d\n",
416 reject, collide, ftotal,
417 flocal);
418 }
419 } while (retry_bucket);
420 } while (retry_descent);
421
422 if (skip_rep) {
423 dprintk("skip rep\n");
424 continue;
425 }
426
427 dprintk("choose got %d\n", item);
428 out[outpos] = item;
429 outpos++;
430 }
431
432 dprintk("choose returns %d\n", outpos);
433 return outpos;
434}
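
To make the r' arithmetic concrete: in firstn mode, replica slot rep = 1 after two total failures (ftotal = 2) retries with r' = 1 + 2 = 3, so later slots simply skip past rejected items; in indep mode with numrep = 3, flocal = 1 and ftotal = 2, the same slot retries with r' = 1 + 3 * (1 + 2) = 10, spacing the retry values of different slots so they do not collide on the same replacement.
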
435
436
437/**
438 * crush_do_rule - calculate a mapping with the given input and rule
439 * @map: the crush_map
440 * @ruleno: the rule id
441 * @x: hash input
442 * @result: pointer to result vector
443 * @result_max: maximum result size
444 * @force: force initial replica choice; -1 for none
445 */
446int crush_do_rule(struct crush_map *map,
447 int ruleno, int x, int *result, int result_max,
448 int force, __u32 *weight)
449{
450 int result_len;
451 int force_context[CRUSH_MAX_DEPTH];
452 int force_pos = -1;
453 int a[CRUSH_MAX_SET];
454 int b[CRUSH_MAX_SET];
455 int c[CRUSH_MAX_SET];
456 int recurse_to_leaf;
457 int *w;
458 int wsize = 0;
459 int *o;
460 int osize;
461 int *tmp;
462 struct crush_rule *rule;
463 int step;
464 int i, j;
465 int numrep;
466 int firstn;
467 int rc = -1;
468
469 BUG_ON(ruleno >= map->max_rules);
470
471 rule = map->rules[ruleno];
472 result_len = 0;
473 w = a;
474 o = b;
475
476 /*
477 * determine hierarchical context of force, if any. note
478 * that this may or may not correspond to the specific types
479 * referenced by the crush rule.
480 */
481 if (force >= 0) {
482 if (force >= map->max_devices ||
483 map->device_parents[force] == 0) {
484 /*dprintk("CRUSH: forcefed device dne\n");*/
485 rc = -1; /* force fed device dne */
486 goto out;
487 }
488 if (!is_out(map, weight, force, x)) {
489 while (1) {
490 force_context[++force_pos] = force;
491 if (force >= 0)
492 force = map->device_parents[force];
493 else
494 force = map->bucket_parents[-1-force];
495 if (force == 0)
496 break;
497 }
498 }
499 }
500
501 for (step = 0; step < rule->len; step++) {
502 firstn = 0;
503 switch (rule->steps[step].op) {
504 case CRUSH_RULE_TAKE:
505 w[0] = rule->steps[step].arg1;
506 if (force_pos >= 0) {
507 BUG_ON(force_context[force_pos] != w[0]);
508 force_pos--;
509 }
510 wsize = 1;
511 break;
512
513 case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
514 case CRUSH_RULE_CHOOSE_FIRSTN:
515 firstn = 1; /* fall through */
516 case CRUSH_RULE_CHOOSE_LEAF_INDEP:
517 case CRUSH_RULE_CHOOSE_INDEP:
518 BUG_ON(wsize == 0);
519
520 recurse_to_leaf =
521 rule->steps[step].op ==
522 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
523 rule->steps[step].op ==
524 CRUSH_RULE_CHOOSE_LEAF_INDEP;
525
526 /* reset output */
527 osize = 0;
528
529 for (i = 0; i < wsize; i++) {
530 /*
531 * see CRUSH_N, CRUSH_N_MINUS macros.
532 * basically, numrep <= 0 means relative to
533 * the provided result_max
534 */
535 numrep = rule->steps[step].arg1;
536 if (numrep <= 0) {
537 numrep += result_max;
538 if (numrep <= 0)
539 continue;
540 }
541 j = 0;
542 if (osize == 0 && force_pos >= 0) {
543 /* skip any intermediate types */
544 while (force_pos &&
545 force_context[force_pos] < 0 &&
546 rule->steps[step].arg2 !=
547 map->buckets[-1 -
548 force_context[force_pos]]->type)
549 force_pos--;
550 o[osize] = force_context[force_pos];
551 if (recurse_to_leaf)
552 c[osize] = force_context[0];
553 j++;
554 force_pos--;
555 }
556 osize += crush_choose(map,
557 map->buckets[-1-w[i]],
558 weight,
559 x, numrep,
560 rule->steps[step].arg2,
561 o+osize, j,
562 firstn,
563 recurse_to_leaf, c+osize);
564 }
565
566 if (recurse_to_leaf)
567 /* copy final _leaf_ values to output set */
568 memcpy(o, c, osize*sizeof(*o));
569
570 /* swap o and w arrays */
571 tmp = o;
572 o = w;
573 w = tmp;
574 wsize = osize;
575 break;
576
577
578 case CRUSH_RULE_EMIT:
579 for (i = 0; i < wsize && result_len < result_max; i++) {
580 result[result_len] = w[i];
581 result_len++;
582 }
583 wsize = 0;
584 break;
585
586 default:
587 BUG_ON(1);
588 }
589 }
590 rc = result_len;
591
592out:
593 return rc;
594}
595
596
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
new file mode 100644
index 000000000000..98e90046fd9f
--- /dev/null
+++ b/fs/ceph/crush/mapper.h
@@ -0,0 +1,20 @@
1#ifndef _CRUSH_MAPPER_H
2#define _CRUSH_MAPPER_H
3
4/*
5 * CRUSH functions for finding rules and then mapping an input to an
6 * output set.
7 *
8 * LGPL2
9 */
10
11#include "crush.h"
12
13extern int crush_find_rule(struct crush_map *map, int pool, int type, int size);
14extern int crush_do_rule(struct crush_map *map,
15 int ruleno,
16 int x, int *result, int result_max,
17 int forcefeed, /* -1 for none */
18 __u32 *weights);
19
20#endif
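
Taken together, the two entry points give the usual call sequence: look up a rule for a (ruleset, type, size) triple, then run it. A user-space sketch (the map construction and the weights vector, 16-bit fixed point with 0x10000 meaning fully in, are assumed to exist elsewhere):

#include <stdio.h>
#include "mapper.h"

/* Sketch only: 'map' is a fully populated crush_map and 'weights' has
 * one 16-bit fixed-point entry per device. */
static void map_object(struct crush_map *map, __u32 *weights, int x)
{
	int result[8];
	int ruleno, n, i;

	ruleno = crush_find_rule(map, 0 /* ruleset */, 1 /* type */, 2 /* size */);
	if (ruleno < 0)
		return;

	n = crush_do_rule(map, ruleno, x, result, 2, -1 /* no forcefeed */, weights);
	for (i = 0; i < n; i++)
		printf("replica %d -> osd%d\n", i, result[i]);
}
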
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
new file mode 100644
index 000000000000..f704b3b62424
--- /dev/null
+++ b/fs/ceph/crypto.c
@@ -0,0 +1,409 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/scatterlist.h>
6#include <linux/slab.h>
7#include <crypto/hash.h>
8
9#include "crypto.h"
10#include "decode.h"
11
12int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
13{
14 if (*p + sizeof(u16) + sizeof(key->created) +
15 sizeof(u16) + key->len > end)
16 return -ERANGE;
17 ceph_encode_16(p, key->type);
18 ceph_encode_copy(p, &key->created, sizeof(key->created));
19 ceph_encode_16(p, key->len);
20 ceph_encode_copy(p, key->key, key->len);
21 return 0;
22}
23
24int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
25{
26 ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
27 key->type = ceph_decode_16(p);
28 ceph_decode_copy(p, &key->created, sizeof(key->created));
29 key->len = ceph_decode_16(p);
30 ceph_decode_need(p, end, key->len, bad);
31 key->key = kmalloc(key->len, GFP_NOFS);
32 if (!key->key)
33 return -ENOMEM;
34 ceph_decode_copy(p, key->key, key->len);
35 return 0;
36
37bad:
38 dout("failed to decode crypto key\n");
39 return -EINVAL;
40}
41
42int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
43{
44 int inlen = strlen(inkey);
45 int blen = inlen * 3 / 4;
46 void *buf, *p;
47 int ret;
48
49 dout("crypto_key_unarmor %s\n", inkey);
50 buf = kmalloc(blen, GFP_NOFS);
51 if (!buf)
52 return -ENOMEM;
53 blen = ceph_unarmor(buf, inkey, inkey+inlen);
54 if (blen < 0) {
55 kfree(buf);
56 return blen;
57 }
58
59 p = buf;
60 ret = ceph_crypto_key_decode(key, &p, p + blen);
61 kfree(buf);
62 if (ret)
63 return ret;
64 dout("crypto_key_unarmor key %p type %d len %d\n", key,
65 key->type, key->len);
66 return 0;
67}
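
The blen = inlen * 3 / 4 sizing above is the standard base64 ratio: four armor characters decode to at most three bytes, so, for example, a 56-character armored key needs at most 42 bytes of buffer; ceph_unarmor then reports the exact decoded length (or a negative error).
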
68
69
70
71#define AES_KEY_SIZE 16
72
73static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
74{
75 return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
76}
77
78const u8 *aes_iv = "cephsageyudagreg";
79
80int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len,
81 const void *src, size_t src_len)
82{
83 struct scatterlist sg_in[2], sg_out[1];
84 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
85 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
86 int ret;
87 void *iv;
88 int ivsize;
89 size_t zero_padding = (0x10 - (src_len & 0x0f));
90 char pad[16];
91
92 if (IS_ERR(tfm))
93 return PTR_ERR(tfm);
94
95 memset(pad, zero_padding, zero_padding);
96
97 *dst_len = src_len + zero_padding;
98
99 crypto_blkcipher_setkey((void *)tfm, key, key_len);
100 sg_init_table(sg_in, 2);
101 sg_set_buf(&sg_in[0], src, src_len);
102 sg_set_buf(&sg_in[1], pad, zero_padding);
103 sg_init_table(sg_out, 1);
104 sg_set_buf(sg_out, dst, *dst_len);
105 iv = crypto_blkcipher_crt(tfm)->iv;
106 ivsize = crypto_blkcipher_ivsize(tfm);
107
108 memcpy(iv, aes_iv, ivsize);
109 /*
110 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
111 key, key_len, 1);
112 print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
113 src, src_len, 1);
114 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
115 pad, zero_padding, 1);
116 */
117 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
118 src_len + zero_padding);
119 crypto_free_blkcipher(tfm);
120 if (ret < 0)
121 pr_err("ceph_aes_encrypt failed %d\n", ret);
122 /*
123 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
124 dst, *dst_len, 1);
125 */
126 return ret;
127}
128
129int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len,
130 const void *src1, size_t src1_len,
131 const void *src2, size_t src2_len)
132{
133 struct scatterlist sg_in[3], sg_out[1];
134 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
135 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
136 int ret;
137 void *iv;
138 int ivsize;
139 size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
140 char pad[16];
141
142 if (IS_ERR(tfm))
143 return PTR_ERR(tfm);
144
145 memset(pad, zero_padding, zero_padding);
146
147 *dst_len = src1_len + src2_len + zero_padding;
148
149 crypto_blkcipher_setkey((void *)tfm, key, key_len);
150 sg_init_table(sg_in, 3);
151 sg_set_buf(&sg_in[0], src1, src1_len);
152 sg_set_buf(&sg_in[1], src2, src2_len);
153 sg_set_buf(&sg_in[2], pad, zero_padding);
154 sg_init_table(sg_out, 1);
155 sg_set_buf(sg_out, dst, *dst_len);
156 iv = crypto_blkcipher_crt(tfm)->iv;
157 ivsize = crypto_blkcipher_ivsize(tfm);
158
159 memcpy(iv, aes_iv, ivsize);
160 /*
161 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
162 key, key_len, 1);
163 print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
164 src1, src1_len, 1);
165 print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
166 src2, src2_len, 1);
167 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
168 pad, zero_padding, 1);
169 */
170 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
171 src1_len + src2_len + zero_padding);
172 crypto_free_blkcipher(tfm);
173 if (ret < 0)
174 pr_err("ceph_aes_encrypt2 failed %d\n", ret);
175 /*
176 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
177 dst, *dst_len, 1);
178 */
179 return ret;
180}
181
182int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len,
183 const void *src, size_t src_len)
184{
185 struct scatterlist sg_in[1], sg_out[2];
186 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
187 struct blkcipher_desc desc = { .tfm = tfm };
188 char pad[16];
189 void *iv;
190 int ivsize;
191 int ret;
192 int last_byte;
193
194 if (IS_ERR(tfm))
195 return PTR_ERR(tfm);
196
197 crypto_blkcipher_setkey((void *)tfm, key, key_len);
198 sg_init_table(sg_in, 1);
199 sg_init_table(sg_out, 2);
200 sg_set_buf(sg_in, src, src_len);
201 sg_set_buf(&sg_out[0], dst, *dst_len);
202 sg_set_buf(&sg_out[1], pad, sizeof(pad));
203
204 iv = crypto_blkcipher_crt(tfm)->iv;
205 ivsize = crypto_blkcipher_ivsize(tfm);
206
207 memcpy(iv, aes_iv, ivsize);
208
209 /*
210 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
211 key, key_len, 1);
212 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
213 src, src_len, 1);
214 */
215
216 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
217 crypto_free_blkcipher(tfm);
218 if (ret < 0) {
219 pr_err("ceph_aes_decrypt failed %d\n", ret);
220 return ret;
221 }
222
223 if (src_len <= *dst_len)
224 last_byte = ((char *)dst)[src_len - 1];
225 else
226 last_byte = pad[src_len - *dst_len - 1];
227 if (last_byte <= 16 && src_len >= last_byte) {
228 *dst_len = src_len - last_byte;
229 } else {
230 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
231 last_byte, (int)src_len);
232 return -EPERM; /* bad padding */
233 }
234 /*
235 print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
236 dst, *dst_len, 1);
237 */
238 return 0;
239}
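
The tail check mirrors the PKCS#7-style padding added by the encrypt side, where the pad byte value equals the pad length and 1..16 bytes are always appended: encrypting 13 bytes adds three 0x03 bytes to fill the AES block, so on decrypt last_byte = 3 and *dst_len becomes 16 - 3 = 13, while a 16-byte plaintext gains a full block of 0x10 bytes that is stripped the same way.
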
240
241int ceph_aes_decrypt2(const void *key, int key_len,
242 void *dst1, size_t *dst1_len,
243 void *dst2, size_t *dst2_len,
244 const void *src, size_t src_len)
245{
246 struct scatterlist sg_in[1], sg_out[3];
247 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
248 struct blkcipher_desc desc = { .tfm = tfm };
249 char pad[16];
250 void *iv;
251 int ivsize;
252 int ret;
253 int last_byte;
254
255 if (IS_ERR(tfm))
256 return PTR_ERR(tfm);
257
258 sg_init_table(sg_in, 1);
259 sg_set_buf(sg_in, src, src_len);
260 sg_init_table(sg_out, 3);
261 sg_set_buf(&sg_out[0], dst1, *dst1_len);
262 sg_set_buf(&sg_out[1], dst2, *dst2_len);
263 sg_set_buf(&sg_out[2], pad, sizeof(pad));
264
265 crypto_blkcipher_setkey((void *)tfm, key, key_len);
266 iv = crypto_blkcipher_crt(tfm)->iv;
267 ivsize = crypto_blkcipher_ivsize(tfm);
268
269 memcpy(iv, aes_iv, ivsize);
270
271 /*
272 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
273 key, key_len, 1);
274 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
275 src, src_len, 1);
276 */
277
278 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
279 crypto_free_blkcipher(tfm);
280 if (ret < 0) {
281 pr_err("ceph_aes_decrypt2 failed %d\n", ret);
282 return ret;
283 }
284
285 if (src_len <= *dst1_len)
286 last_byte = ((char *)dst1)[src_len - 1];
287 else if (src_len <= *dst1_len + *dst2_len)
288 last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
289 else
290 last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
291 if (last_byte <= 16 && src_len >= last_byte) {
292 src_len -= last_byte;
293 } else {
294 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
295 last_byte, (int)src_len);
296 return -EPERM; /* bad padding */
297 }
298
299 if (src_len < *dst1_len) {
300 *dst1_len = src_len;
301 *dst2_len = 0;
302 } else {
303 *dst2_len = src_len - *dst1_len;
304 }
305 /*
306 print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1,
307 dst1, *dst1_len, 1);
308 print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1,
309 dst2, *dst2_len, 1);
310 */
311
312 return 0;
313}
314
315
316int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
317 const void *src, size_t src_len)
318{
319 switch (secret->type) {
320 case CEPH_CRYPTO_NONE:
321 if (*dst_len < src_len)
322 return -ERANGE;
323 memcpy(dst, src, src_len);
324 *dst_len = src_len;
325 return 0;
326
327 case CEPH_CRYPTO_AES:
328 return ceph_aes_decrypt(secret->key, secret->len, dst,
329 dst_len, src, src_len);
330
331 default:
332 return -EINVAL;
333 }
334}
335
336int ceph_decrypt2(struct ceph_crypto_key *secret,
337 void *dst1, size_t *dst1_len,
338 void *dst2, size_t *dst2_len,
339 const void *src, size_t src_len)
340{
341 size_t t;
342
343 switch (secret->type) {
344 case CEPH_CRYPTO_NONE:
345 if (*dst1_len + *dst2_len < src_len)
346 return -ERANGE;
347 t = min(*dst1_len, src_len);
348 memcpy(dst1, src, t);
349 *dst1_len = t;
350 src += t;
351 src_len -= t;
352 if (src_len) {
353 t = min(*dst2_len, src_len);
354 memcpy(dst2, src, t);
355 *dst2_len = t;
356 }
357 return 0;
358
359 case CEPH_CRYPTO_AES:
360 return ceph_aes_decrypt2(secret->key, secret->len,
361 dst1, dst1_len, dst2, dst2_len,
362 src, src_len);
363
364 default:
365 return -EINVAL;
366 }
367}
368
369int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
370 const void *src, size_t src_len)
371{
372 switch (secret->type) {
373 case CEPH_CRYPTO_NONE:
374 if (*dst_len < src_len)
375 return -ERANGE;
376 memcpy(dst, src, src_len);
377 *dst_len = src_len;
378 return 0;
379
380 case CEPH_CRYPTO_AES:
381 return ceph_aes_encrypt(secret->key, secret->len, dst,
382 dst_len, src, src_len);
383
384 default:
385 return -EINVAL;
386 }
387}
388
389int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
390 const void *src1, size_t src1_len,
391 const void *src2, size_t src2_len)
392{
393 switch (secret->type) {
394 case CEPH_CRYPTO_NONE:
395 if (*dst_len < src1_len + src2_len)
396 return -ERANGE;
397 memcpy(dst, src1, src1_len);
398 memcpy(dst + src1_len, src2, src2_len);
399 *dst_len = src1_len + src2_len;
400 return 0;
401
402 case CEPH_CRYPTO_AES:
403 return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
404 src1, src1_len, src2, src2_len);
405
406 default:
407 return -EINVAL;
408 }
409}
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
new file mode 100644
index 000000000000..40b502e6bd89
--- /dev/null
+++ b/fs/ceph/crypto.h
@@ -0,0 +1,48 @@
1#ifndef _FS_CEPH_CRYPTO_H
2#define _FS_CEPH_CRYPTO_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * cryptographic secret
9 */
10struct ceph_crypto_key {
11 int type;
12 struct ceph_timespec created;
13 int len;
14 void *key;
15};
16
17static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
18{
19 kfree(key->key);
20}
21
22extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
23 void **p, void *end);
24extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
25 void **p, void *end);
26extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
27
28/* crypto.c */
29extern int ceph_decrypt(struct ceph_crypto_key *secret,
30 void *dst, size_t *dst_len,
31 const void *src, size_t src_len);
32extern int ceph_encrypt(struct ceph_crypto_key *secret,
33 void *dst, size_t *dst_len,
34 const void *src, size_t src_len);
35extern int ceph_decrypt2(struct ceph_crypto_key *secret,
36 void *dst1, size_t *dst1_len,
37 void *dst2, size_t *dst2_len,
38 const void *src, size_t src_len);
39extern int ceph_encrypt2(struct ceph_crypto_key *secret,
40 void *dst, size_t *dst_len,
41 const void *src1, size_t src1_len,
42 const void *src2, size_t src2_len);
43
44/* armor.c */
45extern int ceph_armor(char *dst, const void *src, const void *end);
46extern int ceph_unarmor(void *dst, const char *src, const char *end);
47
48#endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
new file mode 100644
index 000000000000..f7048da92acc
--- /dev/null
+++ b/fs/ceph/debugfs.c
@@ -0,0 +1,484 @@
1#include "ceph_debug.h"
2
3#include <linux/device.h>
4#include <linux/slab.h>
5#include <linux/module.h>
6#include <linux/ctype.h>
7#include <linux/debugfs.h>
8#include <linux/seq_file.h>
9
10#include "super.h"
11#include "mds_client.h"
12#include "mon_client.h"
13#include "auth.h"
14
15#ifdef CONFIG_DEBUG_FS
16
17/*
18 * Implement /sys/kernel/debug/ceph fun
19 *
20 * /sys/kernel/debug/ceph/client* - an instance of the ceph client
21 * .../osdmap - current osdmap
22 * .../mdsmap - current mdsmap
23 * .../monmap - current monmap
24 * .../osdc - active osd requests
25 * .../mdsc - active mds requests
26 * .../monc - mon client state
27 * .../dentry_lru - dump contents of dentry lru
28 * .../caps - expose cap (reservation) stats
29 * .../bdi - symlink to ../../bdi/something
30 */
31
32static struct dentry *ceph_debugfs_dir;
33
34static int monmap_show(struct seq_file *s, void *p)
35{
36 int i;
37 struct ceph_client *client = s->private;
38
39 if (client->monc.monmap == NULL)
40 return 0;
41
42 seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
43 for (i = 0; i < client->monc.monmap->num_mon; i++) {
44 struct ceph_entity_inst *inst =
45 &client->monc.monmap->mon_inst[i];
46
47 seq_printf(s, "\t%s%lld\t%s\n",
48 ENTITY_NAME(inst->name),
49 pr_addr(&inst->addr.in_addr));
50 }
51 return 0;
52}
53
54static int mdsmap_show(struct seq_file *s, void *p)
55{
56 int i;
57 struct ceph_client *client = s->private;
58
59 if (client->mdsc.mdsmap == NULL)
60 return 0;
61 seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch);
62 seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root);
63 seq_printf(s, "session_timeout %d\n",
64 client->mdsc.mdsmap->m_session_timeout);
65 seq_printf(s, "session_autoclose %d\n",
66 client->mdsc.mdsmap->m_session_autoclose);
67 for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) {
68 struct ceph_entity_addr *addr =
69 &client->mdsc.mdsmap->m_info[i].addr;
70 int state = client->mdsc.mdsmap->m_info[i].state;
71
72 seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr),
73 ceph_mds_state_name(state));
74 }
75 return 0;
76}
77
78static int osdmap_show(struct seq_file *s, void *p)
79{
80 int i;
81 struct ceph_client *client = s->private;
82 struct rb_node *n;
83
84 if (client->osdc.osdmap == NULL)
85 return 0;
86 seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
87 seq_printf(s, "flags%s%s\n",
88 (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
89 " NEARFULL" : "",
90 (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
91 " FULL" : "");
92 for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
93 struct ceph_pg_pool_info *pool =
94 rb_entry(n, struct ceph_pg_pool_info, node);
95 seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
96 pool->id, pool->v.pg_num, pool->pg_num_mask,
97 pool->v.lpg_num, pool->lpg_num_mask);
98 }
99 for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
100 struct ceph_entity_addr *addr =
101 &client->osdc.osdmap->osd_addr[i];
102 int state = client->osdc.osdmap->osd_state[i];
103 char sb[64];
104
105 seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
106 i, pr_addr(&addr->in_addr),
107 ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
108 ceph_osdmap_state_str(sb, sizeof(sb), state));
109 }
110 return 0;
111}
112
113static int monc_show(struct seq_file *s, void *p)
114{
115 struct ceph_client *client = s->private;
116 struct ceph_mon_statfs_request *req;
117 struct ceph_mon_client *monc = &client->monc;
118 struct rb_node *rp;
119
120 mutex_lock(&monc->mutex);
121
122 if (monc->have_mdsmap)
123 seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
124 if (monc->have_osdmap)
125 seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
126 if (monc->want_next_osdmap)
127 seq_printf(s, "want next osdmap\n");
128
129 for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) {
130 req = rb_entry(rp, struct ceph_mon_statfs_request, node);
131 seq_printf(s, "%lld statfs\n", req->tid);
132 }
133
134 mutex_unlock(&monc->mutex);
135 return 0;
136}
137
138static int mdsc_show(struct seq_file *s, void *p)
139{
140 struct ceph_client *client = s->private;
141 struct ceph_mds_client *mdsc = &client->mdsc;
142 struct ceph_mds_request *req;
143 struct rb_node *rp;
144 int pathlen;
145 u64 pathbase;
146 char *path;
147
148 mutex_lock(&mdsc->mutex);
149 for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
150 req = rb_entry(rp, struct ceph_mds_request, r_node);
151
152 if (req->r_request)
153 seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds);
154 else
155 seq_printf(s, "%lld\t(no request)\t", req->r_tid);
156
157 seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
158
159 if (req->r_got_unsafe)
160 seq_printf(s, "\t(unsafe)");
161 else
162 seq_printf(s, "\t");
163
164 if (req->r_inode) {
165 seq_printf(s, " #%llx", ceph_ino(req->r_inode));
166 } else if (req->r_dentry) {
167 path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
168 &pathbase, 0);
169 spin_lock(&req->r_dentry->d_lock);
170 seq_printf(s, " #%llx/%.*s (%s)",
171 ceph_ino(req->r_dentry->d_parent->d_inode),
172 req->r_dentry->d_name.len,
173 req->r_dentry->d_name.name,
174 path ? path : "");
175 spin_unlock(&req->r_dentry->d_lock);
176 kfree(path);
177 } else if (req->r_path1) {
178 seq_printf(s, " #%llx/%s", req->r_ino1.ino,
179 req->r_path1);
180 }
181
182 if (req->r_old_dentry) {
183 path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
184 &pathbase, 0);
185 spin_lock(&req->r_old_dentry->d_lock);
186 seq_printf(s, " #%llx/%.*s (%s)",
187 ceph_ino(req->r_old_dentry->d_parent->d_inode),
188 req->r_old_dentry->d_name.len,
189 req->r_old_dentry->d_name.name,
190 path ? path : "");
191 spin_unlock(&req->r_old_dentry->d_lock);
192 kfree(path);
193 } else if (req->r_path2) {
194 if (req->r_ino2.ino)
195 seq_printf(s, " #%llx/%s", req->r_ino2.ino,
196 req->r_path2);
197 else
198 seq_printf(s, " %s", req->r_path2);
199 }
200
201 seq_printf(s, "\n");
202 }
203 mutex_unlock(&mdsc->mutex);
204
205 return 0;
206}
207
208static int osdc_show(struct seq_file *s, void *pp)
209{
210 struct ceph_client *client = s->private;
211 struct ceph_osd_client *osdc = &client->osdc;
212 struct rb_node *p;
213
214 mutex_lock(&osdc->request_mutex);
215 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
216 struct ceph_osd_request *req;
217 struct ceph_osd_request_head *head;
218 struct ceph_osd_op *op;
219 int num_ops;
220 int opcode, olen;
221 int i;
222
223 req = rb_entry(p, struct ceph_osd_request, r_node);
224
225 seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
226 req->r_osd ? req->r_osd->o_osd : -1,
227 le32_to_cpu(req->r_pgid.pool),
228 le16_to_cpu(req->r_pgid.ps));
229
230 head = req->r_request->front.iov_base;
231 op = (void *)(head + 1);
232
233 num_ops = le16_to_cpu(head->num_ops);
234 olen = le32_to_cpu(head->object_len);
235 seq_printf(s, "%.*s", olen,
236 (const char *)(head->ops + num_ops));
237
238 if (req->r_reassert_version.epoch)
239 seq_printf(s, "\t%u'%llu",
240 (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
241 le64_to_cpu(req->r_reassert_version.version));
242 else
243 seq_printf(s, "\t");
244
245 for (i = 0; i < num_ops; i++) {
246 opcode = le16_to_cpu(op->op);
247 seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
248 op++;
249 }
250
251 seq_printf(s, "\n");
252 }
253 mutex_unlock(&osdc->request_mutex);
254 return 0;
255}
256
257static int caps_show(struct seq_file *s, void *p)
258{
259 struct ceph_client *client = s->private;
260 int total, avail, used, reserved, min;
261
262 ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
263 seq_printf(s, "total\t\t%d\n"
264 "avail\t\t%d\n"
265 "used\t\t%d\n"
266 "reserved\t%d\n"
267 "min\t\t%d\n",
268 total, avail, used, reserved, min);
269 return 0;
270}
271
272static int dentry_lru_show(struct seq_file *s, void *ptr)
273{
274 struct ceph_client *client = s->private;
275 struct ceph_mds_client *mdsc = &client->mdsc;
276 struct ceph_dentry_info *di;
277
278 spin_lock(&mdsc->dentry_lru_lock);
279 list_for_each_entry(di, &mdsc->dentry_lru, lru) {
280 struct dentry *dentry = di->dentry;
281 seq_printf(s, "%p %p\t%.*s\n",
282 di, dentry, dentry->d_name.len, dentry->d_name.name);
283 }
284 spin_unlock(&mdsc->dentry_lru_lock);
285
286 return 0;
287}
288
289#define DEFINE_SHOW_FUNC(name) \
290static int name##_open(struct inode *inode, struct file *file) \
291{ \
292 struct seq_file *sf; \
293 int ret; \
294 \
295 ret = single_open(file, name, NULL); \
296 sf = file->private_data; \
297 sf->private = inode->i_private; \
298 return ret; \
299} \
300 \
301static const struct file_operations name##_fops = { \
302 .open = name##_open, \
303 .read = seq_read, \
304 .llseek = seq_lseek, \
305 .release = single_release, \
306};
307
308DEFINE_SHOW_FUNC(monmap_show)
309DEFINE_SHOW_FUNC(mdsmap_show)
310DEFINE_SHOW_FUNC(osdmap_show)
311DEFINE_SHOW_FUNC(monc_show)
312DEFINE_SHOW_FUNC(mdsc_show)
313DEFINE_SHOW_FUNC(osdc_show)
314DEFINE_SHOW_FUNC(dentry_lru_show)
315DEFINE_SHOW_FUNC(caps_show)
316
317static int congestion_kb_set(void *data, u64 val)
318{
319 struct ceph_client *client = (struct ceph_client *)data;
320
321 if (client)
322 client->mount_args->congestion_kb = (int)val;
323
324 return 0;
325}
326
327static int congestion_kb_get(void *data, u64 *val)
328{
329 struct ceph_client *client = (struct ceph_client *)data;
330
331 if (client)
332 *val = (u64)client->mount_args->congestion_kb;
333
334 return 0;
335}
336
337
338DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
339 congestion_kb_set, "%llu\n");
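
Assuming debugfs is mounted in its usual /sys/kernel/debug location, the resulting attribute can be inspected and tuned at runtime: reading /sys/kernel/debug/ceph/<fsid>.client<id>/writeback_congestion_kb shows the current threshold, and writing a value such as 16384 (16 MB) to the same file adjusts it on a live mount.
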
340
341int __init ceph_debugfs_init(void)
342{
343 ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
344 if (!ceph_debugfs_dir)
345 return -ENOMEM;
346 return 0;
347}
348
349void ceph_debugfs_cleanup(void)
350{
351 debugfs_remove(ceph_debugfs_dir);
352}
353
354int ceph_debugfs_client_init(struct ceph_client *client)
355{
356 int ret = -ENOMEM;
357 char name[80];
358
359 snprintf(name, sizeof(name), FSID_FORMAT ".client%lld",
360 PR_FSID(&client->fsid), client->monc.auth->global_id);
361
362 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
363 if (!client->debugfs_dir)
364 goto out;
365
366 client->monc.debugfs_file = debugfs_create_file("monc",
367 0600,
368 client->debugfs_dir,
369 client,
370 &monc_show_fops);
371 if (!client->monc.debugfs_file)
372 goto out;
373
374 client->mdsc.debugfs_file = debugfs_create_file("mdsc",
375 0600,
376 client->debugfs_dir,
377 client,
378 &mdsc_show_fops);
379 if (!client->mdsc.debugfs_file)
380 goto out;
381
382 client->osdc.debugfs_file = debugfs_create_file("osdc",
383 0600,
384 client->debugfs_dir,
385 client,
386 &osdc_show_fops);
387 if (!client->osdc.debugfs_file)
388 goto out;
389
390 client->debugfs_monmap = debugfs_create_file("monmap",
391 0600,
392 client->debugfs_dir,
393 client,
394 &monmap_show_fops);
395 if (!client->debugfs_monmap)
396 goto out;
397
398 client->debugfs_mdsmap = debugfs_create_file("mdsmap",
399 0600,
400 client->debugfs_dir,
401 client,
402 &mdsmap_show_fops);
403 if (!client->debugfs_mdsmap)
404 goto out;
405
406 client->debugfs_osdmap = debugfs_create_file("osdmap",
407 0600,
408 client->debugfs_dir,
409 client,
410 &osdmap_show_fops);
411 if (!client->debugfs_osdmap)
412 goto out;
413
414 client->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
415 0600,
416 client->debugfs_dir,
417 client,
418 &dentry_lru_show_fops);
419 if (!client->debugfs_dentry_lru)
420 goto out;
421
422 client->debugfs_caps = debugfs_create_file("caps",
423 0400,
424 client->debugfs_dir,
425 client,
426 &caps_show_fops);
427 if (!client->debugfs_caps)
428 goto out;
429
430 client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb",
431 0600,
432 client->debugfs_dir,
433 client,
434 &congestion_kb_fops);
435 if (!client->debugfs_congestion_kb)
436 goto out;
437
438 snprintf(name, sizeof(name), "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
439 client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
440 name);
441
442 return 0;
443
444out:
445 ceph_debugfs_client_cleanup(client);
446 return ret;
447}
448
449void ceph_debugfs_client_cleanup(struct ceph_client *client)
450{
451 debugfs_remove(client->debugfs_bdi);
452 debugfs_remove(client->debugfs_caps);
453 debugfs_remove(client->debugfs_dentry_lru);
454 debugfs_remove(client->debugfs_osdmap);
455 debugfs_remove(client->debugfs_mdsmap);
456 debugfs_remove(client->debugfs_monmap);
457 debugfs_remove(client->osdc.debugfs_file);
458 debugfs_remove(client->mdsc.debugfs_file);
459 debugfs_remove(client->monc.debugfs_file);
460 debugfs_remove(client->debugfs_congestion_kb);
461 debugfs_remove(client->debugfs_dir);
462}
463
464#else /* CONFIG_DEBUG_FS */
465
466int __init ceph_debugfs_init(void)
467{
468 return 0;
469}
470
471void ceph_debugfs_cleanup(void)
472{
473}
474
475int ceph_debugfs_client_init(struct ceph_client *client)
476{
477 return 0;
478}
479
480void ceph_debugfs_client_cleanup(struct ceph_client *client)
481{
482}
483
484#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
new file mode 100644
index 000000000000..65b3e022eaf5
--- /dev/null
+++ b/fs/ceph/decode.h
@@ -0,0 +1,194 @@
1#ifndef __CEPH_DECODE_H
2#define __CEPH_DECODE_H
3
4#include <asm/unaligned.h>
5#include <linux/time.h>
6
7#include "types.h"
8
9/*
10 * in all cases,
11 * void **p pointer to position pointer
12 * void *end pointer to end of buffer (last byte + 1)
13 */
14
15static inline u64 ceph_decode_64(void **p)
16{
17 u64 v = get_unaligned_le64(*p);
18 *p += sizeof(u64);
19 return v;
20}
21static inline u32 ceph_decode_32(void **p)
22{
23 u32 v = get_unaligned_le32(*p);
24 *p += sizeof(u32);
25 return v;
26}
27static inline u16 ceph_decode_16(void **p)
28{
29 u16 v = get_unaligned_le16(*p);
30 *p += sizeof(u16);
31 return v;
32}
33static inline u8 ceph_decode_8(void **p)
34{
35 u8 v = *(u8 *)*p;
36 (*p)++;
37 return v;
38}
39static inline void ceph_decode_copy(void **p, void *pv, size_t n)
40{
41 memcpy(pv, *p, n);
42 *p += n;
43}
44
45/*
46 * bounds check input.
47 */
48#define ceph_decode_need(p, end, n, bad) \
49 do { \
50 if (unlikely(*(p) + (n) > (end))) \
51 goto bad; \
52 } while (0)
53
54#define ceph_decode_64_safe(p, end, v, bad) \
55 do { \
56 ceph_decode_need(p, end, sizeof(u64), bad); \
57 v = ceph_decode_64(p); \
58 } while (0)
59#define ceph_decode_32_safe(p, end, v, bad) \
60 do { \
61 ceph_decode_need(p, end, sizeof(u32), bad); \
62 v = ceph_decode_32(p); \
63 } while (0)
64#define ceph_decode_16_safe(p, end, v, bad) \
65 do { \
66 ceph_decode_need(p, end, sizeof(u16), bad); \
67 v = ceph_decode_16(p); \
68 } while (0)
69#define ceph_decode_8_safe(p, end, v, bad) \
70 do { \
71 ceph_decode_need(p, end, sizeof(u8), bad); \
72 v = ceph_decode_8(p); \
73 } while (0)
74
75#define ceph_decode_copy_safe(p, end, pv, n, bad) \
76 do { \
77 ceph_decode_need(p, end, n, bad); \
78 ceph_decode_copy(p, pv, n); \
79 } while (0)
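
The _safe variants bundle the bounds check with the read and jump to a caller-supplied label on a short buffer; ceph_crypto_key_decode in crypto.c follows exactly this shape. A minimal sketch of the pattern (the function and its field layout are hypothetical):

/* Sketch: decode a u32 count followed by that many payload bytes. */
static int demo_decode(void **p, void *end, void *buf, u32 buflen)
{
	u32 n;

	ceph_decode_32_safe(p, end, n, bad);
	if (n > buflen)
		goto bad;
	ceph_decode_copy_safe(p, end, buf, n, bad);
	return n;

bad:
	return -ERANGE;
}
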
80
81/*
82 * struct ceph_timespec <-> struct timespec
83 */
84static inline void ceph_decode_timespec(struct timespec *ts,
85 const struct ceph_timespec *tv)
86{
87 ts->tv_sec = le32_to_cpu(tv->tv_sec);
88 ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
89}
90static inline void ceph_encode_timespec(struct ceph_timespec *tv,
91 const struct timespec *ts)
92{
93 tv->tv_sec = cpu_to_le32(ts->tv_sec);
94 tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
95}
96
97/*
98 * sockaddr_storage <-> ceph_sockaddr
99 */
100static inline void ceph_encode_addr(struct ceph_entity_addr *a)
101{
102 a->in_addr.ss_family = htons(a->in_addr.ss_family);
103}
104static inline void ceph_decode_addr(struct ceph_entity_addr *a)
105{
106 a->in_addr.ss_family = ntohs(a->in_addr.ss_family);
107 WARN_ON(a->in_addr.ss_family == 512); /* 512 == AF_INET byte-swapped: family was already in host order */
108}
109
110/*
111 * encoders
112 */
113static inline void ceph_encode_64(void **p, u64 v)
114{
115 put_unaligned_le64(v, (__le64 *)*p);
116 *p += sizeof(u64);
117}
118static inline void ceph_encode_32(void **p, u32 v)
119{
120 put_unaligned_le32(v, (__le32 *)*p);
121 *p += sizeof(u32);
122}
123static inline void ceph_encode_16(void **p, u16 v)
124{
125 put_unaligned_le16(v, (__le16 *)*p);
126 *p += sizeof(u16);
127}
128static inline void ceph_encode_8(void **p, u8 v)
129{
130 *(u8 *)*p = v;
131 (*p)++;
132}
133static inline void ceph_encode_copy(void **p, const void *s, int len)
134{
135 memcpy(*p, s, len);
136 *p += len;
137}
138
139/*
140 * filepath, string encoders
141 */
142static inline void ceph_encode_filepath(void **p, void *end,
143 u64 ino, const char *path)
144{
145 u32 len = path ? strlen(path) : 0;
146 BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
147 ceph_encode_8(p, 1);
148 ceph_encode_64(p, ino);
149 ceph_encode_32(p, len);
150 if (len)
151 memcpy(*p, path, len);
152 *p += len;
153}
154
155static inline void ceph_encode_string(void **p, void *end,
156 const char *s, u32 len)
157{
158 BUG_ON(*p + sizeof(len) + len > end);
159 ceph_encode_32(p, len);
160 if (len)
161 memcpy(*p, s, len);
162 *p += len;
163}
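
Unlike the _safe decode macros, these encoders BUG on overflow rather than branching, so the caller is expected to have sized the buffer up front. A sketch of that contract (hypothetical helper):

/* Sketch: the buffer must hold a 4-byte length prefix plus the payload. */
static void demo_encode(void *buf, size_t buflen, const char *s)
{
	void *p = buf;
	void *end = buf + buflen;	/* caller sized this as 4 + strlen(s) or more */

	ceph_encode_string(&p, end, s, strlen(s));
}
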
164
165#define ceph_encode_need(p, end, n, bad) \
166 do { \
167 if (unlikely(*(p) + (n) > (end))) \
168 goto bad; \
169 } while (0)
170
171#define ceph_encode_64_safe(p, end, v, bad) \
172 do { \
173 ceph_encode_need(p, end, sizeof(u64), bad); \
174 ceph_encode_64(p, v); \
175 } while (0)
176#define ceph_encode_32_safe(p, end, v, bad) \
177 do { \
178 ceph_encode_need(p, end, sizeof(u32), bad); \
179 ceph_encode_32(p, v); \
180 } while (0)
181#define ceph_encode_16_safe(p, end, v, bad) \
182 do { \
183 ceph_encode_need(p, end, sizeof(u16), bad); \
184 ceph_encode_16(p, v); \
185 } while (0)
186
187#define ceph_encode_copy_safe(p, end, pv, n, bad) \
188 do { \
189 ceph_encode_need(p, end, n, bad); \
190 ceph_encode_copy(p, pv, n); \
191 } while (0)
192
193
194#endif
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
new file mode 100644
index 000000000000..650d2db5ed26
--- /dev/null
+++ b/fs/ceph/dir.c
@@ -0,0 +1,1233 @@
1#include "ceph_debug.h"
2
3#include <linux/spinlock.h>
4#include <linux/fs_struct.h>
5#include <linux/namei.h>
6#include <linux/slab.h>
7#include <linux/sched.h>
8
9#include "super.h"
10
11/*
12 * Directory operations: readdir, lookup, create, link, unlink,
13 * rename, etc.
14 */
15
16/*
17 * Ceph MDS operations are specified in terms of a base ino and
18 * relative path. Thus, the client can specify an operation on a
19 * specific inode (e.g., a getattr due to fstat(2)), or as a path
20 * relative to, say, the root directory.
21 *
22 * Normally, we limit ourselves to strict inode ops (no path component)
23 * or dentry operations (a single path component relative to an ino). The
24 * exception to this is open_root_dentry(), which will open the mount
25 * point by name.
26 */
27
28const struct inode_operations ceph_dir_iops;
29const struct file_operations ceph_dir_fops;
30struct dentry_operations ceph_dentry_ops;
31
32/*
33 * Initialize ceph dentry state.
34 */
35int ceph_init_dentry(struct dentry *dentry)
36{
37 struct ceph_dentry_info *di;
38
39 if (dentry->d_fsdata)
40 return 0;
41
42 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
43 dentry->d_op = &ceph_dentry_ops;
44 else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
45 dentry->d_op = &ceph_snapdir_dentry_ops;
46 else
47 dentry->d_op = &ceph_snap_dentry_ops;
48
49 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS);
50 if (!di)
51 return -ENOMEM; /* oh well */
52
53 spin_lock(&dentry->d_lock);
54 if (dentry->d_fsdata) /* lost a race */
55 goto out_unlock;
56 di->dentry = dentry;
57 di->lease_session = NULL;
58 dentry->d_fsdata = di;
59 dentry->d_time = jiffies;
60 ceph_dentry_lru_add(dentry);
61out_unlock:
62 spin_unlock(&dentry->d_lock);
63 return 0;
64}
65
66
67
68/*
69 * for readdir, we encode the directory frag and offset within that
70 * frag into f_pos.
71 */
72static unsigned fpos_frag(loff_t p)
73{
74 return p >> 32;
75}
76static unsigned fpos_off(loff_t p)
77{
78 return p & 0xffffffff;
79}
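
These two helpers invert the packing done by ceph_make_fpos (defined elsewhere in this series): the upper 32 bits of f_pos carry the fragment, the lower 32 bits the offset within it. A sketch of the round trip, with the packing half repeated here only for illustration:

/* Illustrative twin of ceph_make_fpos: pack frag into the high half. */
static inline loff_t make_fpos(unsigned frag, unsigned off)
{
	return ((loff_t)frag << 32) | (loff_t)off;
}

/* For any 32-bit frag f and offset o:
 *   fpos_frag(make_fpos(f, o)) == f  and  fpos_off(make_fpos(f, o)) == o
 */
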
80
81/*
82 * When possible, we try to satisfy a readdir by peeking at the
83 * dcache. We make this work by carefully ordering dentries on
84 * d_u.d_child when we initially get results back from the MDS, and
85 * falling back to a "normal" sync readdir if any dentries in the dir
86 * are dropped.
87 *
88 * I_COMPLETE indicates we have all dentries in the dir. It is
89 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
90 * the MDS if/when the directory is modified).
91 */
92static int __dcache_readdir(struct file *filp,
93 void *dirent, filldir_t filldir)
94{
95 struct inode *inode = filp->f_dentry->d_inode;
96 struct ceph_file_info *fi = filp->private_data;
97 struct dentry *parent = filp->f_dentry;
98 struct inode *dir = parent->d_inode;
99 struct list_head *p;
100 struct dentry *dentry, *last;
101 struct ceph_dentry_info *di;
102 int err = 0;
103
104 /* claim ref on last dentry we returned */
105 last = fi->dentry;
106 fi->dentry = NULL;
107
108 dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
109 last);
110
111 spin_lock(&dcache_lock);
112
113 /* start at beginning? */
114 if (filp->f_pos == 2 || (last &&
115 filp->f_pos < ceph_dentry(last)->offset)) {
116 if (list_empty(&parent->d_subdirs))
117 goto out_unlock;
118 p = parent->d_subdirs.prev;
119 dout(" initial p %p/%p\n", p->prev, p->next);
120 } else {
121 p = last->d_u.d_child.prev;
122 }
123
124more:
125 dentry = list_entry(p, struct dentry, d_u.d_child);
126 di = ceph_dentry(dentry);
127 while (1) {
128 dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next,
129 parent->d_subdirs.prev, parent->d_subdirs.next);
130 if (p == &parent->d_subdirs) {
131 fi->at_end = 1;
132 goto out_unlock;
133 }
134 if (!d_unhashed(dentry) && dentry->d_inode &&
135 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
136 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
137 filp->f_pos <= di->offset)
138 break;
139 dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
140 dentry->d_name.len, dentry->d_name.name, di->offset,
141 filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
142 !dentry->d_inode ? " null" : "");
143 p = p->prev;
144 dentry = list_entry(p, struct dentry, d_u.d_child);
145 di = ceph_dentry(dentry);
146 }
147
148 atomic_inc(&dentry->d_count);
149 spin_unlock(&dcache_lock);
150 spin_unlock(&inode->i_lock);
151
152 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
153 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
154 filp->f_pos = di->offset;
155 err = filldir(dirent, dentry->d_name.name,
156 dentry->d_name.len, di->offset,
157 dentry->d_inode->i_ino,
158 dentry->d_inode->i_mode >> 12);
159
160 if (last) {
161 if (err < 0) {
162 /* remember our position */
163 fi->dentry = last;
164 fi->next_offset = di->offset;
165 } else {
166 dput(last);
167 }
168 last = NULL;
169 }
170
171 spin_lock(&inode->i_lock);
172 spin_lock(&dcache_lock);
173
174 last = dentry;
175
176 if (err < 0)
177 goto out_unlock;
178
179 p = p->prev;
180 filp->f_pos++;
181
182 /* make sure a dentry wasn't dropped while we didn't have dcache_lock */
183 if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE))
184 goto more;
185 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
186 err = -EAGAIN;
187
188out_unlock:
189 spin_unlock(&dcache_lock);
190
191 if (last) {
192 spin_unlock(&inode->i_lock);
193 dput(last);
194 spin_lock(&inode->i_lock);
195 }
196
197 return err;
198}
199
200/*
201 * make note of the last dentry we read, so we can
202 * continue at the same lexicographical point,
203 * regardless of what dir changes take place on the
204 * server.
205 */
206static int note_last_dentry(struct ceph_file_info *fi, const char *name,
207 int len)
208{
209 kfree(fi->last_name);
210 fi->last_name = kmalloc(len+1, GFP_NOFS);
211 if (!fi->last_name)
212 return -ENOMEM;
213 memcpy(fi->last_name, name, len);
214 fi->last_name[len] = 0;
215 dout("note_last_dentry '%s'\n", fi->last_name);
216 return 0;
217}
218
219static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
220{
221 struct ceph_file_info *fi = filp->private_data;
222 struct inode *inode = filp->f_dentry->d_inode;
223 struct ceph_inode_info *ci = ceph_inode(inode);
224 struct ceph_client *client = ceph_inode_to_client(inode);
225 struct ceph_mds_client *mdsc = &client->mdsc;
226 unsigned frag = fpos_frag(filp->f_pos);
227 int off = fpos_off(filp->f_pos);
228 int err;
229 u32 ftype;
230 struct ceph_mds_reply_info_parsed *rinfo;
231 const int max_entries = client->mount_args->max_readdir;
232
233 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
234 if (fi->at_end)
235 return 0;
236
237 /* always start with . and .. */
238 if (filp->f_pos == 0) {
239 /* note dir version at start of readdir so we can tell
240 * if any dentries get dropped */
241 fi->dir_release_count = ci->i_release_count;
242
243 dout("readdir off 0 -> '.'\n");
244 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
245 inode->i_ino, inode->i_mode >> 12) < 0)
246 return 0;
247 filp->f_pos = 1;
248 off = 1;
249 }
250 if (filp->f_pos == 1) {
251 dout("readdir off 1 -> '..'\n");
252 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
253 filp->f_dentry->d_parent->d_inode->i_ino,
254 inode->i_mode >> 12) < 0)
255 return 0;
256 filp->f_pos = 2;
257 off = 2;
258 }
259
260 /* can we use the dcache? */
261 spin_lock(&inode->i_lock);
262 if ((filp->f_pos == 2 || fi->dentry) &&
263 !ceph_test_opt(client, NOASYNCREADDIR) &&
264 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
265 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
266 err = __dcache_readdir(filp, dirent, filldir);
267 if (err != -EAGAIN) {
268 spin_unlock(&inode->i_lock);
269 return err;
270 }
271 }
272 spin_unlock(&inode->i_lock);
273 if (fi->dentry) {
274 err = note_last_dentry(fi, fi->dentry->d_name.name,
275 fi->dentry->d_name.len);
276 if (err)
277 return err;
278 dput(fi->dentry);
279 fi->dentry = NULL;
280 }
281
282 /* proceed with a normal readdir */
283
284more:
285 /* do we have the correct frag content buffered? */
286 if (fi->frag != frag || fi->last_readdir == NULL) {
287 struct ceph_mds_request *req;
288 int op = ceph_snap(inode) == CEPH_SNAPDIR ?
289 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
290
291 /* discard old result, if any */
292 if (fi->last_readdir) {
293 ceph_mdsc_put_request(fi->last_readdir);
294 fi->last_readdir = NULL;
295 }
296
297 /* requery frag tree, as the frag topology may have changed */
298 frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
299
300 dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
301 ceph_vinop(inode), frag, fi->last_name);
302 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
303 if (IS_ERR(req))
304 return PTR_ERR(req);
305 req->r_inode = igrab(inode);
306 req->r_dentry = dget(filp->f_dentry);
307 /* hints to request -> mds selection code */
308 req->r_direct_mode = USE_AUTH_MDS;
309 req->r_direct_hash = ceph_frag_value(frag);
310 req->r_direct_is_hash = true;
311 req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
312 req->r_readdir_offset = fi->next_offset;
313 req->r_args.readdir.frag = cpu_to_le32(frag);
314 req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
315 req->r_num_caps = max_entries + 1;
316 err = ceph_mdsc_do_request(mdsc, NULL, req);
317 if (err < 0) {
318 ceph_mdsc_put_request(req);
319 return err;
320 }
321 dout("readdir got and parsed readdir result=%d"
322 " on frag %x, end=%d, complete=%d\n", err, frag,
323 (int)req->r_reply_info.dir_end,
324 (int)req->r_reply_info.dir_complete);
325
326 if (!req->r_did_prepopulate) {
327			dout("readdir !did_prepopulate\n");
328 fi->dir_release_count--; /* preclude I_COMPLETE */
329 }
330
331 /* note next offset and last dentry name */
332 fi->offset = fi->next_offset;
333 fi->last_readdir = req;
334
335 if (req->r_reply_info.dir_end) {
336 kfree(fi->last_name);
337 fi->last_name = NULL;
338 fi->next_offset = 0;
339 } else {
340 rinfo = &req->r_reply_info;
341 err = note_last_dentry(fi,
342 rinfo->dir_dname[rinfo->dir_nr-1],
343 rinfo->dir_dname_len[rinfo->dir_nr-1]);
344 if (err)
345 return err;
346 fi->next_offset += rinfo->dir_nr;
347 }
348 }
349
350 rinfo = &fi->last_readdir->r_reply_info;
351 dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
352 rinfo->dir_nr, off, fi->offset);
353 while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) {
354 u64 pos = ceph_make_fpos(frag, off);
355 struct ceph_mds_reply_inode *in =
356 rinfo->dir_in[off - fi->offset].in;
357 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
358 off, off - fi->offset, rinfo->dir_nr, pos,
359 rinfo->dir_dname_len[off - fi->offset],
360 rinfo->dir_dname[off - fi->offset], in);
361 BUG_ON(!in);
362 ftype = le32_to_cpu(in->mode) >> 12;
363 if (filldir(dirent,
364 rinfo->dir_dname[off - fi->offset],
365 rinfo->dir_dname_len[off - fi->offset],
366 pos,
367 le64_to_cpu(in->ino),
368 ftype) < 0) {
369 dout("filldir stopping us...\n");
370 return 0;
371 }
372 off++;
373 filp->f_pos = pos + 1;
374 }
375
376 if (fi->last_name) {
377 ceph_mdsc_put_request(fi->last_readdir);
378 fi->last_readdir = NULL;
379 goto more;
380 }
381
382 /* more frags? */
383 if (!ceph_frag_is_rightmost(frag)) {
384 frag = ceph_frag_next(frag);
385 off = 0;
386 filp->f_pos = ceph_make_fpos(frag, off);
387 dout("readdir next frag is %x\n", frag);
388 goto more;
389 }
390 fi->at_end = 1;
391
392 /*
393 * if dir_release_count still matches the dir, no dentries
394 * were released during the whole readdir, and we should have
395 * the complete dir contents in our cache.
396 */
397 spin_lock(&inode->i_lock);
398 if (ci->i_release_count == fi->dir_release_count) {
399 dout(" marking %p complete\n", inode);
400 ci->i_ceph_flags |= CEPH_I_COMPLETE;
401 ci->i_max_offset = filp->f_pos;
402 }
403 spin_unlock(&inode->i_lock);
404
405 dout("readdir %p filp %p done.\n", inode, filp);
406 return 0;
407}
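/*
 * Sketch of the fpos encoding used above (conceptual; the real
 * helpers live in the ceph headers): a directory position packs the
 * fragment into the high 32 bits and the offset within that fragment
 * into the low 32 bits, roughly
 *
 *	fpos = ((loff_t)frag << 32) | (loff_t)off;
 *	frag = fpos >> 32;
 *	off  = fpos & 0xffffffff;
 *
 * which is why ceph_dir_llseek() below can detect a fragment change
 * by comparing fpos_frag() of the old and new positions.
 */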
408
409static void reset_readdir(struct ceph_file_info *fi)
410{
411 if (fi->last_readdir) {
412 ceph_mdsc_put_request(fi->last_readdir);
413 fi->last_readdir = NULL;
414 }
415	kfree(fi->last_name);
	fi->last_name = NULL;
416 fi->next_offset = 2; /* compensate for . and .. */
417 if (fi->dentry) {
418 dput(fi->dentry);
419 fi->dentry = NULL;
420 }
421 fi->at_end = 0;
422}
423
424static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
425{
426 struct ceph_file_info *fi = file->private_data;
427 struct inode *inode = file->f_mapping->host;
428	loff_t old_offset = file->f_pos;
429 loff_t retval;
430
431 mutex_lock(&inode->i_mutex);
432 switch (origin) {
433 case SEEK_END:
434 offset += inode->i_size + 2; /* FIXME */
435 break;
436 case SEEK_CUR:
437 offset += file->f_pos;
438 }
439 retval = -EINVAL;
440 if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
441 if (offset != file->f_pos) {
442 file->f_pos = offset;
443 file->f_version = 0;
444 fi->at_end = 0;
445 }
446 retval = offset;
447
448 /*
449 * discard buffered readdir content on seekdir(0), or
450 * seek to new frag, or seek prior to current chunk.
451 */
452 if (offset == 0 ||
453 fpos_frag(offset) != fpos_frag(old_offset) ||
454 fpos_off(offset) < fi->offset) {
455 dout("dir_llseek dropping %p content\n", file);
456 reset_readdir(fi);
457 }
458
459		/* preclude marking the dir complete if we seeked forward */
460 if (offset > old_offset)
461 fi->dir_release_count--;
462 }
463 mutex_unlock(&inode->i_mutex);
464 return retval;
465}
466
467/*
468 * Process result of a lookup/open request.
469 *
470 * Mainly, make sure we return the final req->r_dentry (if it already
471 * existed) in place of the original VFS-provided dentry when they
472 * differ.
473 *
474 * Gracefully handle the case where the MDS replies with -ENOENT and
475 * no trace (which it may do, at its discretion, e.g., if it doesn't
476 * care to issue a lease on the negative dentry).
477 */
478struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
479 struct dentry *dentry, int err)
480{
481 struct ceph_client *client = ceph_client(dentry->d_sb);
482 struct inode *parent = dentry->d_parent->d_inode;
483
484 /* .snap dir? */
485 if (err == -ENOENT &&
486 ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
487 strcmp(dentry->d_name.name,
488 client->mount_args->snapdir_name) == 0) {
489 struct inode *inode = ceph_get_snapdir(parent);
490 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
491 dentry, dentry->d_name.len, dentry->d_name.name, inode);
492 BUG_ON(!d_unhashed(dentry));
493 d_add(dentry, inode);
494 err = 0;
495 }
496
497 if (err == -ENOENT) {
498 /* no trace? */
499 err = 0;
500 if (!req->r_reply_info.head->is_dentry) {
501 dout("ENOENT and no trace, dentry %p inode %p\n",
502 dentry, dentry->d_inode);
503 if (dentry->d_inode) {
504 d_drop(dentry);
505 err = -ENOENT;
506 } else {
507 d_add(dentry, NULL);
508 }
509 }
510 }
511 if (err)
512 dentry = ERR_PTR(err);
513 else if (dentry != req->r_dentry)
514 dentry = dget(req->r_dentry); /* we got spliced */
515 else
516 dentry = NULL;
517 return dentry;
518}
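/*
 * Hypothetical caller sketch (illustration only): callers treat a
 * non-NULL, non-error return as a replacement for their dentry:
 *
 *	struct dentry *d = ceph_finish_lookup(req, dentry, err);
 *	if (IS_ERR(d))
 *		return d;
 *	if (d)
 *		dentry = d;	(request was spliced to an existing alias)
 *
 * See ceph_lookup() below for the real usage.
 */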
519
520static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
521{
522 return ceph_ino(inode) == CEPH_INO_ROOT &&
523 strncmp(dentry->d_name.name, ".ceph", 5) == 0;
524}
525
526/*
527 * Look up a single dir entry. If there is a lookup intent, inform
528 * the MDS so that it gets our 'caps wanted' value in a single op.
529 */
530static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
531 struct nameidata *nd)
532{
533 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
534 struct ceph_mds_client *mdsc = &client->mdsc;
535 struct ceph_mds_request *req;
536 int op;
537 int err;
538
539 dout("lookup %p dentry %p '%.*s'\n",
540 dir, dentry, dentry->d_name.len, dentry->d_name.name);
541
542 if (dentry->d_name.len > NAME_MAX)
543 return ERR_PTR(-ENAMETOOLONG);
544
545 err = ceph_init_dentry(dentry);
546 if (err < 0)
547 return ERR_PTR(err);
548
549 /* open (but not create!) intent? */
550 if (nd &&
551 (nd->flags & LOOKUP_OPEN) &&
552 (nd->flags & LOOKUP_CONTINUE) == 0 && /* only open last component */
553 !(nd->intent.open.flags & O_CREAT)) {
554 int mode = nd->intent.open.create_mode & ~current->fs->umask;
555 return ceph_lookup_open(dir, dentry, nd, mode, 1);
556 }
557
558 /* can we conclude ENOENT locally? */
559 if (dentry->d_inode == NULL) {
560 struct ceph_inode_info *ci = ceph_inode(dir);
561 struct ceph_dentry_info *di = ceph_dentry(dentry);
562
563 spin_lock(&dir->i_lock);
564 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
565 if (strncmp(dentry->d_name.name,
566 client->mount_args->snapdir_name,
567 dentry->d_name.len) &&
568 !is_root_ceph_dentry(dir, dentry) &&
569 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
570 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
571 di->offset = ci->i_max_offset++;
572 spin_unlock(&dir->i_lock);
573 dout(" dir %p complete, -ENOENT\n", dir);
574 d_add(dentry, NULL);
575 di->lease_shared_gen = ci->i_shared_gen;
576 return NULL;
577 }
578 spin_unlock(&dir->i_lock);
579 }
580
581 op = ceph_snap(dir) == CEPH_SNAPDIR ?
582 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
583 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
584 if (IS_ERR(req))
585		return ERR_CAST(req);
586 req->r_dentry = dget(dentry);
587 req->r_num_caps = 2;
588 /* we only need inode linkage */
589 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
590 req->r_locked_dir = dir;
591 err = ceph_mdsc_do_request(mdsc, NULL, req);
592 dentry = ceph_finish_lookup(req, dentry, err);
593 ceph_mdsc_put_request(req); /* will dput(dentry) */
594 dout("lookup result=%p\n", dentry);
595 return dentry;
596}
597
598/*
599 * If we do a create but get no trace back from the MDS, follow up with
600 * a lookup (the VFS expects us to link up the provided dentry).
601 */
602int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
603{
604 struct dentry *result = ceph_lookup(dir, dentry, NULL);
605
606 if (result && !IS_ERR(result)) {
607 /*
608 * We created the item, then did a lookup, and found
609 * it was already linked to another inode we already
610 * had in our cache (and thus got spliced). Link our
611 * dentry to that inode, but don't hash it, just in
612 * case the VFS wants to dereference it.
613 */
614 BUG_ON(!result->d_inode);
615 d_instantiate(dentry, result->d_inode);
616 return 0;
617 }
618 return PTR_ERR(result);
619}
620
621static int ceph_mknod(struct inode *dir, struct dentry *dentry,
622 int mode, dev_t rdev)
623{
624 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
625 struct ceph_mds_client *mdsc = &client->mdsc;
626 struct ceph_mds_request *req;
627 int err;
628
629 if (ceph_snap(dir) != CEPH_NOSNAP)
630 return -EROFS;
631
632 dout("mknod in dir %p dentry %p mode 0%o rdev %d\n",
633 dir, dentry, mode, rdev);
634 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
635 if (IS_ERR(req)) {
636 d_drop(dentry);
637 return PTR_ERR(req);
638 }
639 req->r_dentry = dget(dentry);
640 req->r_num_caps = 2;
641 req->r_locked_dir = dir;
642 req->r_args.mknod.mode = cpu_to_le32(mode);
643 req->r_args.mknod.rdev = cpu_to_le32(rdev);
644 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
645 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
646 err = ceph_mdsc_do_request(mdsc, dir, req);
647 if (!err && !req->r_reply_info.head->is_dentry)
648 err = ceph_handle_notrace_create(dir, dentry);
649 ceph_mdsc_put_request(req);
650 if (err)
651 d_drop(dentry);
652 return err;
653}
654
655static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
656 struct nameidata *nd)
657{
658 dout("create in dir %p dentry %p name '%.*s'\n",
659 dir, dentry, dentry->d_name.len, dentry->d_name.name);
660
661 if (ceph_snap(dir) != CEPH_NOSNAP)
662 return -EROFS;
663
664 if (nd) {
665 BUG_ON((nd->flags & LOOKUP_OPEN) == 0);
666 dentry = ceph_lookup_open(dir, dentry, nd, mode, 0);
667 /* hrm, what should i do here if we get aliased? */
668 if (IS_ERR(dentry))
669 return PTR_ERR(dentry);
670 return 0;
671 }
672
673 /* fall back to mknod */
674 return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0);
675}
676
677static int ceph_symlink(struct inode *dir, struct dentry *dentry,
678 const char *dest)
679{
680 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
681 struct ceph_mds_client *mdsc = &client->mdsc;
682 struct ceph_mds_request *req;
683 int err;
684
685 if (ceph_snap(dir) != CEPH_NOSNAP)
686 return -EROFS;
687
688 dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
689 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
690 if (IS_ERR(req)) {
691 d_drop(dentry);
692 return PTR_ERR(req);
693 }
694 req->r_dentry = dget(dentry);
695 req->r_num_caps = 2;
696 req->r_path2 = kstrdup(dest, GFP_NOFS);
697 req->r_locked_dir = dir;
698 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
699 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
700 err = ceph_mdsc_do_request(mdsc, dir, req);
701 if (!err && !req->r_reply_info.head->is_dentry)
702 err = ceph_handle_notrace_create(dir, dentry);
703 ceph_mdsc_put_request(req);
704 if (err)
705 d_drop(dentry);
706 return err;
707}
708
709static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
710{
711 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
712 struct ceph_mds_client *mdsc = &client->mdsc;
713 struct ceph_mds_request *req;
714 int err = -EROFS;
715 int op;
716
717 if (ceph_snap(dir) == CEPH_SNAPDIR) {
718 /* mkdir .snap/foo is a MKSNAP */
719 op = CEPH_MDS_OP_MKSNAP;
720 dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
721 dentry->d_name.len, dentry->d_name.name, dentry);
722 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
723 dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode);
724 op = CEPH_MDS_OP_MKDIR;
725 } else {
726 goto out;
727 }
728 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
729 if (IS_ERR(req)) {
730 err = PTR_ERR(req);
731 goto out;
732 }
733
734 req->r_dentry = dget(dentry);
735 req->r_num_caps = 2;
736 req->r_locked_dir = dir;
737 req->r_args.mkdir.mode = cpu_to_le32(mode);
738 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
739 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
740 err = ceph_mdsc_do_request(mdsc, dir, req);
741 if (!err && !req->r_reply_info.head->is_dentry)
742 err = ceph_handle_notrace_create(dir, dentry);
743 ceph_mdsc_put_request(req);
744out:
745 if (err < 0)
746 d_drop(dentry);
747 return err;
748}
749
750static int ceph_link(struct dentry *old_dentry, struct inode *dir,
751 struct dentry *dentry)
752{
753 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
754 struct ceph_mds_client *mdsc = &client->mdsc;
755 struct ceph_mds_request *req;
756 int err;
757
758 if (ceph_snap(dir) != CEPH_NOSNAP)
759 return -EROFS;
760
761 dout("link in dir %p old_dentry %p dentry %p\n", dir,
762 old_dentry, dentry);
763 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
764 if (IS_ERR(req)) {
765 d_drop(dentry);
766 return PTR_ERR(req);
767 }
768 req->r_dentry = dget(dentry);
769 req->r_num_caps = 2;
770 req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
771 req->r_locked_dir = dir;
772 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
773 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
774 err = ceph_mdsc_do_request(mdsc, dir, req);
775 if (err)
776 d_drop(dentry);
777 else if (!req->r_reply_info.head->is_dentry)
778 d_instantiate(dentry, igrab(old_dentry->d_inode));
779 ceph_mdsc_put_request(req);
780 return err;
781}
782
783/*
784 * For a soon-to-be unlinked file, drop the LINK caps.  If it
785 * looks like the link count will hit 0, drop any other caps (other
786 * than PIN) we don't specifically want (due to the file still being
787 * open).
788 */
789static int drop_caps_for_unlink(struct inode *inode)
790{
791 struct ceph_inode_info *ci = ceph_inode(inode);
792 int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
793
794 spin_lock(&inode->i_lock);
795 if (inode->i_nlink == 1) {
796 drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
797 ci->i_ceph_flags |= CEPH_I_NODELAY;
798 }
799 spin_unlock(&inode->i_lock);
800 return drop;
801}
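/*
 * Worked example (illustrative): if the file is still open for read,
 * __ceph_caps_wanted() might return FILE_RD|FILE_CACHE; with
 * i_nlink == 1 the mask above becomes LINK_SHARED|LINK_EXCL plus
 * every cap outside FILE_RD|FILE_CACHE|PIN, so only the caps the
 * open file actually needs are retained.
 */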
802
803/*
804 * rmdir and unlink differ only in the metadata op code
805 */
806static int ceph_unlink(struct inode *dir, struct dentry *dentry)
807{
808 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
809 struct ceph_mds_client *mdsc = &client->mdsc;
810 struct inode *inode = dentry->d_inode;
811 struct ceph_mds_request *req;
812 int err = -EROFS;
813 int op;
814
815 if (ceph_snap(dir) == CEPH_SNAPDIR) {
816 /* rmdir .snap/foo is RMSNAP */
817 dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len,
818 dentry->d_name.name, dentry);
819 op = CEPH_MDS_OP_RMSNAP;
820 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
821 dout("unlink/rmdir dir %p dn %p inode %p\n",
822 dir, dentry, inode);
823 op = ((dentry->d_inode->i_mode & S_IFMT) == S_IFDIR) ?
824 CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
825 } else
826 goto out;
827 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
828 if (IS_ERR(req)) {
829 err = PTR_ERR(req);
830 goto out;
831 }
832 req->r_dentry = dget(dentry);
833 req->r_num_caps = 2;
834 req->r_locked_dir = dir;
835 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
836 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
837 req->r_inode_drop = drop_caps_for_unlink(inode);
838 err = ceph_mdsc_do_request(mdsc, dir, req);
839 if (!err && !req->r_reply_info.head->is_dentry)
840 d_delete(dentry);
841 ceph_mdsc_put_request(req);
842out:
843 return err;
844}
845
846static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
847 struct inode *new_dir, struct dentry *new_dentry)
848{
849 struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb);
850 struct ceph_mds_client *mdsc = &client->mdsc;
851 struct ceph_mds_request *req;
852 int err;
853
854 if (ceph_snap(old_dir) != ceph_snap(new_dir))
855 return -EXDEV;
856 if (ceph_snap(old_dir) != CEPH_NOSNAP ||
857 ceph_snap(new_dir) != CEPH_NOSNAP)
858 return -EROFS;
859 dout("rename dir %p dentry %p to dir %p dentry %p\n",
860 old_dir, old_dentry, new_dir, new_dentry);
861 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
862 if (IS_ERR(req))
863 return PTR_ERR(req);
864 req->r_dentry = dget(new_dentry);
865 req->r_num_caps = 2;
866 req->r_old_dentry = dget(old_dentry);
867 req->r_locked_dir = new_dir;
868 req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
869 req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
870 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
871 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
872	/* release LINK_SHARED on source inode (mds will lock it) */
873 req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
874 if (new_dentry->d_inode)
875 req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode);
876 err = ceph_mdsc_do_request(mdsc, old_dir, req);
877 if (!err && !req->r_reply_info.head->is_dentry) {
878 /*
879 * Normally d_move() is done by fill_trace (called by
880 * do_request, above). If there is no trace, we need
881 * to do it here.
882 */
883
884 /* d_move screws up d_subdirs order */
885 ceph_i_clear(new_dir, CEPH_I_COMPLETE);
886
887 d_move(old_dentry, new_dentry);
888
889 /* ensure target dentry is invalidated, despite
890 rehashing bug in vfs_rename_dir */
891 new_dentry->d_time = jiffies;
892 ceph_dentry(new_dentry)->lease_shared_gen = 0;
893 }
894 ceph_mdsc_put_request(req);
895 return err;
896}
897
898
899/*
900 * Check if dentry lease is valid. If not, delete the lease. Try to
901 * renew if the lease is more than half up.
902 */
903static int dentry_lease_is_valid(struct dentry *dentry)
904{
905 struct ceph_dentry_info *di;
906 struct ceph_mds_session *s;
907 int valid = 0;
908 u32 gen;
909 unsigned long ttl;
910 struct ceph_mds_session *session = NULL;
911 struct inode *dir = NULL;
912 u32 seq = 0;
913
914 spin_lock(&dentry->d_lock);
915 di = ceph_dentry(dentry);
916 if (di && di->lease_session) {
917 s = di->lease_session;
918 spin_lock(&s->s_cap_lock);
919 gen = s->s_cap_gen;
920 ttl = s->s_cap_ttl;
921 spin_unlock(&s->s_cap_lock);
922
923 if (di->lease_gen == gen &&
924 time_before(jiffies, dentry->d_time) &&
925 time_before(jiffies, ttl)) {
926 valid = 1;
927 if (di->lease_renew_after &&
928 time_after(jiffies, di->lease_renew_after)) {
929 /* we should renew */
930 dir = dentry->d_parent->d_inode;
931 session = ceph_get_mds_session(s);
932 seq = di->lease_seq;
933 di->lease_renew_after = 0;
934 di->lease_renew_from = jiffies;
935 }
936 }
937 }
938 spin_unlock(&dentry->d_lock);
939
940 if (session) {
941 ceph_mdsc_lease_send_msg(session, dir, dentry,
942 CEPH_MDS_LEASE_RENEW, seq);
943 ceph_put_mds_session(session);
944 }
945 dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
946 return valid;
947}
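/*
 * Timing example (illustrative numbers; the real durations come from
 * the MDS): for a 30s lease granted at time T, d_time is ~T+30s and
 * lease_renew_after ~T+15s.  A lookup between T+15s and T+30s still
 * validates locally but also fires an async CEPH_MDS_LEASE_RENEW,
 * per the "more than half up" rule above.
 */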
948
949/*
950 * Check if directory-wide content lease/cap is valid.
951 */
952static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
953{
954 struct ceph_inode_info *ci = ceph_inode(dir);
955 struct ceph_dentry_info *di = ceph_dentry(dentry);
956 int valid = 0;
957
958 spin_lock(&dir->i_lock);
959 if (ci->i_shared_gen == di->lease_shared_gen)
960 valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
961 spin_unlock(&dir->i_lock);
962 dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
963 dir, (unsigned)ci->i_shared_gen, dentry,
964 (unsigned)di->lease_shared_gen, valid);
965 return valid;
966}
967
968/*
969 * Check if cached dentry can be trusted.
970 */
971static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
972{
973 struct inode *dir = dentry->d_parent->d_inode;
974
975 dout("d_revalidate %p '%.*s' inode %p\n", dentry,
976 dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
977
978 /* always trust cached snapped dentries, snapdir dentry */
979 if (ceph_snap(dir) != CEPH_NOSNAP) {
980 dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
981 dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
982 goto out_touch;
983 }
984 if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR)
985 goto out_touch;
986
987 if (dentry_lease_is_valid(dentry) ||
988 dir_lease_is_valid(dir, dentry))
989 goto out_touch;
990
991 dout("d_revalidate %p invalid\n", dentry);
992 d_drop(dentry);
993 return 0;
994out_touch:
995 ceph_dentry_lru_touch(dentry);
996 return 1;
997}
998
999/*
1000 * When a dentry is released, clear the dir I_COMPLETE if it was part
1001 * of the current dir gen.
1002 */
1003static void ceph_dentry_release(struct dentry *dentry)
1004{
1005 struct ceph_dentry_info *di = ceph_dentry(dentry);
1006 struct inode *parent_inode = dentry->d_parent->d_inode;
1007
1008 if (parent_inode) {
1009 struct ceph_inode_info *ci = ceph_inode(parent_inode);
1010
1011 spin_lock(&parent_inode->i_lock);
1012 if (ci->i_shared_gen == di->lease_shared_gen) {
1013 dout(" clearing %p complete (d_release)\n",
1014 parent_inode);
1015 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1016 ci->i_release_count++;
1017 }
1018 spin_unlock(&parent_inode->i_lock);
1019 }
1020 if (di) {
1021 ceph_dentry_lru_del(dentry);
1022 if (di->lease_session)
1023 ceph_put_mds_session(di->lease_session);
1024 kmem_cache_free(ceph_dentry_cachep, di);
1025 dentry->d_fsdata = NULL;
1026 }
1027}
1028
1029static int ceph_snapdir_d_revalidate(struct dentry *dentry,
1030 struct nameidata *nd)
1031{
1032 /*
1033 * Eventually, we'll want to revalidate snapped metadata
1034 * too... probably...
1035 */
1036 return 1;
1037}
1038
1039
1040
1041/*
1042 * read() on a dir. This weird interface hack only works if mounted
1043 * with '-o dirstat'.
1044 */
1045static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1046 loff_t *ppos)
1047{
1048 struct ceph_file_info *cf = file->private_data;
1049 struct inode *inode = file->f_dentry->d_inode;
1050 struct ceph_inode_info *ci = ceph_inode(inode);
1051 int left;
1052
1053 if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT))
1054 return -EISDIR;
1055
1056 if (!cf->dir_info) {
1057 cf->dir_info = kmalloc(1024, GFP_NOFS);
1058 if (!cf->dir_info)
1059 return -ENOMEM;
1060 cf->dir_info_len =
1061 sprintf(cf->dir_info,
1062 "entries: %20lld\n"
1063 " files: %20lld\n"
1064 " subdirs: %20lld\n"
1065 "rentries: %20lld\n"
1066 " rfiles: %20lld\n"
1067 " rsubdirs: %20lld\n"
1068 "rbytes: %20lld\n"
1069 "rctime: %10ld.%09ld\n",
1070 ci->i_files + ci->i_subdirs,
1071 ci->i_files,
1072 ci->i_subdirs,
1073 ci->i_rfiles + ci->i_rsubdirs,
1074 ci->i_rfiles,
1075 ci->i_rsubdirs,
1076 ci->i_rbytes,
1077 (long)ci->i_rctime.tv_sec,
1078 (long)ci->i_rctime.tv_nsec);
1079 }
1080
1081 if (*ppos >= cf->dir_info_len)
1082 return 0;
1083 size = min_t(unsigned, size, cf->dir_info_len-*ppos);
1084 left = copy_to_user(buf, cf->dir_info + *ppos, size);
1085 if (left == size)
1086 return -EFAULT;
1087 *ppos += (size - left);
1088 return size - left;
1089}
1090
1091/*
1092 * an fsync() on a dir will wait for any uncommitted directory
1093 * operations to commit.
1094 */
1095static int ceph_dir_fsync(struct file *file, struct dentry *dentry,
1096 int datasync)
1097{
1098 struct inode *inode = dentry->d_inode;
1099 struct ceph_inode_info *ci = ceph_inode(inode);
1100 struct list_head *head = &ci->i_unsafe_dirops;
1101 struct ceph_mds_request *req;
1102 u64 last_tid;
1103 int ret = 0;
1104
1105 dout("dir_fsync %p\n", inode);
1106 spin_lock(&ci->i_unsafe_lock);
1107 if (list_empty(head))
1108 goto out;
1109
1110 req = list_entry(head->prev,
1111 struct ceph_mds_request, r_unsafe_dir_item);
1112 last_tid = req->r_tid;
1113
1114 do {
1115 ceph_mdsc_get_request(req);
1116 spin_unlock(&ci->i_unsafe_lock);
1117 dout("dir_fsync %p wait on tid %llu (until %llu)\n",
1118 inode, req->r_tid, last_tid);
1119 if (req->r_timeout) {
1120 ret = wait_for_completion_timeout(
1121 &req->r_safe_completion, req->r_timeout);
1122 if (ret > 0)
1123 ret = 0;
1124 else if (ret == 0)
1125 ret = -EIO; /* timed out */
1126 } else {
1127 wait_for_completion(&req->r_safe_completion);
1128 }
1129 spin_lock(&ci->i_unsafe_lock);
1130 ceph_mdsc_put_request(req);
1131
1132 if (ret || list_empty(head))
1133 break;
1134 req = list_entry(head->next,
1135 struct ceph_mds_request, r_unsafe_dir_item);
1136 } while (req->r_tid < last_tid);
1137out:
1138 spin_unlock(&ci->i_unsafe_lock);
1139 return ret;
1140}
1141
1142/*
1143 * We maintain a private dentry LRU.
1144 *
1145 * FIXME: this needs to be changed to a per-mds lru to be useful.
1146 */
1147void ceph_dentry_lru_add(struct dentry *dn)
1148{
1149 struct ceph_dentry_info *di = ceph_dentry(dn);
1150 struct ceph_mds_client *mdsc;
1151
1152 dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
1153 dn->d_name.len, dn->d_name.name);
1154 if (di) {
1155 mdsc = &ceph_client(dn->d_sb)->mdsc;
1156 spin_lock(&mdsc->dentry_lru_lock);
1157 list_add_tail(&di->lru, &mdsc->dentry_lru);
1158 mdsc->num_dentry++;
1159 spin_unlock(&mdsc->dentry_lru_lock);
1160 }
1161}
1162
1163void ceph_dentry_lru_touch(struct dentry *dn)
1164{
1165 struct ceph_dentry_info *di = ceph_dentry(dn);
1166 struct ceph_mds_client *mdsc;
1167
1168 dout("dentry_lru_touch %p %p '%.*s'\n", di, dn,
1169 dn->d_name.len, dn->d_name.name);
1170 if (di) {
1171 mdsc = &ceph_client(dn->d_sb)->mdsc;
1172 spin_lock(&mdsc->dentry_lru_lock);
1173 list_move_tail(&di->lru, &mdsc->dentry_lru);
1174 spin_unlock(&mdsc->dentry_lru_lock);
1175 }
1176}
1177
1178void ceph_dentry_lru_del(struct dentry *dn)
1179{
1180 struct ceph_dentry_info *di = ceph_dentry(dn);
1181 struct ceph_mds_client *mdsc;
1182
1183 dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
1184 dn->d_name.len, dn->d_name.name);
1185 if (di) {
1186 mdsc = &ceph_client(dn->d_sb)->mdsc;
1187 spin_lock(&mdsc->dentry_lru_lock);
1188 list_del_init(&di->lru);
1189 mdsc->num_dentry--;
1190 spin_unlock(&mdsc->dentry_lru_lock);
1191 }
1192}
1193
1194const struct file_operations ceph_dir_fops = {
1195 .read = ceph_read_dir,
1196 .readdir = ceph_readdir,
1197 .llseek = ceph_dir_llseek,
1198 .open = ceph_open,
1199 .release = ceph_release,
1200 .unlocked_ioctl = ceph_ioctl,
1201 .fsync = ceph_dir_fsync,
1202};
1203
1204const struct inode_operations ceph_dir_iops = {
1205 .lookup = ceph_lookup,
1206 .permission = ceph_permission,
1207 .getattr = ceph_getattr,
1208 .setattr = ceph_setattr,
1209 .setxattr = ceph_setxattr,
1210 .getxattr = ceph_getxattr,
1211 .listxattr = ceph_listxattr,
1212 .removexattr = ceph_removexattr,
1213 .mknod = ceph_mknod,
1214 .symlink = ceph_symlink,
1215 .mkdir = ceph_mkdir,
1216 .link = ceph_link,
1217 .unlink = ceph_unlink,
1218 .rmdir = ceph_unlink,
1219 .rename = ceph_rename,
1220 .create = ceph_create,
1221};
1222
1223struct dentry_operations ceph_dentry_ops = {
1224 .d_revalidate = ceph_d_revalidate,
1225 .d_release = ceph_dentry_release,
1226};
1227
1228struct dentry_operations ceph_snapdir_dentry_ops = {
1229 .d_revalidate = ceph_snapdir_d_revalidate,
1230};
1231
1232struct dentry_operations ceph_snap_dentry_ops = {
1233};
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
new file mode 100644
index 000000000000..9d67572fb328
--- /dev/null
+++ b/fs/ceph/export.c
@@ -0,0 +1,224 @@
1#include "ceph_debug.h"
2
3#include <linux/exportfs.h>
4#include <linux/slab.h>
5#include <asm/unaligned.h>
6
7#include "super.h"
8
9/*
10 * NFS export support
11 *
12 * NFS re-export of a ceph mount is, at present, only semireliable.
13 * The basic issue is that the Ceph architecture doesn't lend itself
14 * well to generating filehandles that will remain valid forever.
15 *
16 * So, we do our best. If you're lucky, your inode will be in the
17 * client's cache. If it's not, and you have a connectable fh, then
18 * the MDS server may be able to find it for you. Otherwise, you get
19 * ESTALE.
20 *
21 * There are ways to make this more reliable, but in the non-connectable
22 * fh case, it won't ever work perfectly, and in the connectable case,
23 * some changes are needed on the MDS side to work better.
24 */
25
26/*
27 * Basic fh
28 */
29struct ceph_nfs_fh {
30 u64 ino;
31} __attribute__ ((packed));
32
33/*
34 * Larger 'connectable' fh that includes parent ino and name hash.
35 * Use this whenever possible, as it works more reliably.
36 */
37struct ceph_nfs_confh {
38 u64 ino, parent_ino;
39 u32 parent_name_hash;
40} __attribute__ ((packed));
41
42static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
43 int connectable)
44{
45 struct ceph_nfs_fh *fh = (void *)rawfh;
46 struct ceph_nfs_confh *cfh = (void *)rawfh;
47 struct dentry *parent = dentry->d_parent;
48 struct inode *inode = dentry->d_inode;
49 int type;
50
51 /* don't re-export snaps */
52 if (ceph_snap(inode) != CEPH_NOSNAP)
53 return -EINVAL;
54
55 if (*max_len >= sizeof(*cfh)) {
56 dout("encode_fh %p connectable\n", dentry);
57 cfh->ino = ceph_ino(dentry->d_inode);
58 cfh->parent_ino = ceph_ino(parent->d_inode);
59 cfh->parent_name_hash = parent->d_name.hash;
60 *max_len = sizeof(*cfh);
61 type = 2;
62	} else if (*max_len >= sizeof(*fh)) {
63 if (connectable)
64 return -ENOSPC;
65 dout("encode_fh %p\n", dentry);
66 fh->ino = ceph_ino(dentry->d_inode);
67 *max_len = sizeof(*fh);
68 type = 1;
69 } else {
70 return -ENOSPC;
71 }
72 return type;
73}
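/*
 * Sizing example, as the checks above are written: a buffer of at
 * least sizeof(struct ceph_nfs_confh) (20 bytes, packed) yields a
 * connectable type-2 handle; otherwise a buffer that fits
 * struct ceph_nfs_fh (8 bytes) yields a plain type-1 handle, and
 * anything smaller fails with -ENOSPC.
 */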
74
75/*
76 * convert regular fh to dentry
77 *
78 * FIXME: we should try harder by querying the mds for the ino.
79 */
80static struct dentry *__fh_to_dentry(struct super_block *sb,
81 struct ceph_nfs_fh *fh)
82{
83 struct inode *inode;
84 struct dentry *dentry;
85 struct ceph_vino vino;
86 int err;
87
88 dout("__fh_to_dentry %llx\n", fh->ino);
89 vino.ino = fh->ino;
90 vino.snap = CEPH_NOSNAP;
91 inode = ceph_find_inode(sb, vino);
92 if (!inode)
93 return ERR_PTR(-ESTALE);
94
95	dentry = d_obtain_alias(inode);
96	if (IS_ERR(dentry)) {
97		pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
98		       fh->ino, inode);
99		/* d_obtain_alias() dropped the inode ref on failure */
100		return dentry;
101	}
102	err = ceph_init_dentry(dentry);
103
104	if (err < 0) {
105		dput(dentry);	/* also drops the inode reference */
106		return ERR_PTR(err);
107	}
108 dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry);
109 return dentry;
110}
111
112/*
113 * convert connectable fh to dentry
114 */
115static struct dentry *__cfh_to_dentry(struct super_block *sb,
116 struct ceph_nfs_confh *cfh)
117{
118 struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc;
119 struct inode *inode;
120 struct dentry *dentry;
121 struct ceph_vino vino;
122 int err;
123
124 dout("__cfh_to_dentry %llx (%llx/%x)\n",
125 cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
126
127 vino.ino = cfh->ino;
128 vino.snap = CEPH_NOSNAP;
129 inode = ceph_find_inode(sb, vino);
130 if (!inode) {
131 struct ceph_mds_request *req;
132
133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
134 USE_ANY_MDS);
135 if (IS_ERR(req))
136			return ERR_CAST(req);
137
138 req->r_ino1 = vino;
139 req->r_ino2.ino = cfh->parent_ino;
140 req->r_ino2.snap = CEPH_NOSNAP;
141		req->r_path2 = kmalloc(16, GFP_NOFS);
		if (!req->r_path2) {
			ceph_mdsc_put_request(req);
			return ERR_PTR(-ENOMEM);
		}
142		snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash);
143 req->r_num_caps = 1;
144 err = ceph_mdsc_do_request(mdsc, NULL, req);
145 ceph_mdsc_put_request(req);
146 inode = ceph_find_inode(sb, vino);
147 if (!inode)
148 return ERR_PTR(err ? err : -ESTALE);
149 }
150
151	dentry = d_obtain_alias(inode);
152	if (IS_ERR(dentry)) {
153		pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
154		       cfh->ino, inode);
155		/* d_obtain_alias() dropped the inode ref on failure */
156		return dentry;
157	}
158	err = ceph_init_dentry(dentry);
159	if (err < 0) {
160		dput(dentry);	/* also drops the inode reference */
161		return ERR_PTR(err);
162	}
163 dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry);
164 return dentry;
165}
166
167static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid,
168 int fh_len, int fh_type)
169{
170 if (fh_type == 1)
171 return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw);
172 else
173 return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw);
174}
175
176/*
177 * get parent, if possible.
178 *
179 * FIXME: we could do better by querying the mds to discover the
180 * parent.
181 */
182static struct dentry *ceph_fh_to_parent(struct super_block *sb,
183 struct fid *fid,
184 int fh_len, int fh_type)
185{
186 struct ceph_nfs_confh *cfh = (void *)fid->raw;
187 struct ceph_vino vino;
188 struct inode *inode;
189 struct dentry *dentry;
190 int err;
191
192 if (fh_type == 1)
193 return ERR_PTR(-ESTALE);
194
195 pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino,
196 cfh->parent_name_hash);
197
198	vino.ino = cfh->parent_ino;
199 vino.snap = CEPH_NOSNAP;
200 inode = ceph_find_inode(sb, vino);
201 if (!inode)
202 return ERR_PTR(-ESTALE);
203
204	dentry = d_obtain_alias(inode);
205	if (IS_ERR(dentry)) {
206		pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
207		       cfh->parent_ino, inode);
208		/* d_obtain_alias() dropped the inode ref on failure */
209		return dentry;
210	}
211	err = ceph_init_dentry(dentry);
212	if (err < 0) {
213		dput(dentry);	/* also drops the inode reference */
214		return ERR_PTR(err);
215	}
216	dout("fh_to_parent %llx %p dentry %p\n", cfh->parent_ino, inode, dentry);
217 return dentry;
218}
219
220const struct export_operations ceph_export_ops = {
221 .encode_fh = ceph_encode_fh,
222 .fh_to_dentry = ceph_fh_to_dentry,
223 .fh_to_parent = ceph_fh_to_parent,
224};
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
new file mode 100644
index 000000000000..ed6f19721d6e
--- /dev/null
+++ b/fs/ceph/file.c
@@ -0,0 +1,939 @@
1#include "ceph_debug.h"
2
3#include <linux/sched.h>
4#include <linux/slab.h>
5#include <linux/file.h>
6#include <linux/namei.h>
7#include <linux/writeback.h>
8
9#include "super.h"
10#include "mds_client.h"
11
12/*
13 * Ceph file operations
14 *
15 * Implement basic open/close functionality, and implement
16 * read/write.
17 *
18 * We implement three modes of file I/O:
19 * - buffered uses the generic_file_aio_{read,write} helpers
20 *
21 * - synchronous is used when there is multi-client read/write
22 * sharing, avoids the page cache, and synchronously waits for an
23 * ack from the OSD.
24 *
25 * - direct io takes the variant of the sync path that references
26 * user pages directly.
27 *
28 * fsync() flushes and waits on dirty pages, but just queues metadata
29 * for writeback: since the MDS can recover size and mtime there is no
30 * need to wait for MDS acknowledgement.
31 */
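/*
 * Decision sketch (illustrative; see ceph_aio_read()/ceph_aio_write()
 * below): the synchronous path is taken when the needed FILE_CACHE or
 * FILE_BUFFER cap is missing, when the file is O_DIRECT, or when the
 * mount is MS_SYNCHRONOUS; otherwise I/O goes through the generic
 * buffered helpers.
 */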
32
33
34/*
35 * Prepare an open request. Preallocate ceph_cap to avoid an
36 * inopportune ENOMEM later.
37 */
38static struct ceph_mds_request *
39prepare_open_request(struct super_block *sb, int flags, int create_mode)
40{
41 struct ceph_client *client = ceph_sb_to_client(sb);
42 struct ceph_mds_client *mdsc = &client->mdsc;
43 struct ceph_mds_request *req;
44 int want_auth = USE_ANY_MDS;
45 int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
46
47 if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC))
48 want_auth = USE_AUTH_MDS;
49
50 req = ceph_mdsc_create_request(mdsc, op, want_auth);
51 if (IS_ERR(req))
52 goto out;
53 req->r_fmode = ceph_flags_to_mode(flags);
54 req->r_args.open.flags = cpu_to_le32(flags);
55 req->r_args.open.mode = cpu_to_le32(create_mode);
56 req->r_args.open.preferred = cpu_to_le32(-1);
57out:
58 return req;
59}
60
61/*
62 * initialize private struct file data.
63 * if we fail, clean up by dropping fmode reference on the ceph_inode
64 */
65static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
66{
67 struct ceph_file_info *cf;
68 int ret = 0;
69
70 switch (inode->i_mode & S_IFMT) {
71 case S_IFREG:
72 case S_IFDIR:
73 dout("init_file %p %p 0%o (regular)\n", inode, file,
74 inode->i_mode);
75 cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
76 if (cf == NULL) {
77 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
78 return -ENOMEM;
79 }
80 cf->fmode = fmode;
81 cf->next_offset = 2;
82 file->private_data = cf;
83 BUG_ON(inode->i_fop->release != ceph_release);
84 break;
85
86 case S_IFLNK:
87 dout("init_file %p %p 0%o (symlink)\n", inode, file,
88 inode->i_mode);
89 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
90 break;
91
92 default:
93 dout("init_file %p %p 0%o (special)\n", inode, file,
94 inode->i_mode);
95 /*
96 * we need to drop the open ref now, since we don't
97 * have .release set to ceph_release.
98 */
99 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
100 BUG_ON(inode->i_fop->release == ceph_release);
101
102 /* call the proper open fop */
103 ret = inode->i_fop->open(inode, file);
104 }
105 return ret;
106}
107
108/*
109 * If the filp already has private_data, that means the file was
110 * already opened by intent during lookup, and we do nothing.
111 *
112 * If we already have the requisite capabilities, we can satisfy
113 * the open request locally (no need to request new caps from the
114 * MDS). We do, however, need to inform the MDS (asynchronously)
115 * if our wanted caps set expands.
116 */
117int ceph_open(struct inode *inode, struct file *file)
118{
119 struct ceph_inode_info *ci = ceph_inode(inode);
120 struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
121 struct ceph_mds_client *mdsc = &client->mdsc;
122 struct ceph_mds_request *req;
123 struct ceph_file_info *cf = file->private_data;
124 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
125 int err;
126 int flags, fmode, wanted;
127
128 if (cf) {
129 dout("open file %p is already opened\n", file);
130 return 0;
131 }
132
133 /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */
134 flags = file->f_flags & ~(O_CREAT|O_EXCL);
135 if (S_ISDIR(inode->i_mode))
136 flags = O_DIRECTORY; /* mds likes to know */
137
138 dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
139 ceph_vinop(inode), file, flags, file->f_flags);
140 fmode = ceph_flags_to_mode(flags);
141 wanted = ceph_caps_for_mode(fmode);
142
143 /* snapped files are read-only */
144 if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
145 return -EROFS;
146
147 /* trivially open snapdir */
148 if (ceph_snap(inode) == CEPH_SNAPDIR) {
149 spin_lock(&inode->i_lock);
150 __ceph_get_fmode(ci, fmode);
151 spin_unlock(&inode->i_lock);
152 return ceph_init_file(inode, file, fmode);
153 }
154
155 /*
156 * No need to block if we have any caps. Update wanted set
157 * asynchronously.
158 */
159 spin_lock(&inode->i_lock);
160 if (__ceph_is_any_real_caps(ci)) {
161 int mds_wanted = __ceph_caps_mds_wanted(ci);
162 int issued = __ceph_caps_issued(ci, NULL);
163
164 dout("open %p fmode %d want %s issued %s using existing\n",
165 inode, fmode, ceph_cap_string(wanted),
166 ceph_cap_string(issued));
167 __ceph_get_fmode(ci, fmode);
168 spin_unlock(&inode->i_lock);
169
170 /* adjust wanted? */
171 if ((issued & wanted) != wanted &&
172 (mds_wanted & wanted) != wanted &&
173 ceph_snap(inode) != CEPH_SNAPDIR)
174 ceph_check_caps(ci, 0, NULL);
175
176 return ceph_init_file(inode, file, fmode);
177 } else if (ceph_snap(inode) != CEPH_NOSNAP &&
178 (ci->i_snap_caps & wanted) == wanted) {
179 __ceph_get_fmode(ci, fmode);
180 spin_unlock(&inode->i_lock);
181 return ceph_init_file(inode, file, fmode);
182 }
183 spin_unlock(&inode->i_lock);
184
185 dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
186 req = prepare_open_request(inode->i_sb, flags, 0);
187 if (IS_ERR(req)) {
188 err = PTR_ERR(req);
189 goto out;
190 }
191 req->r_inode = igrab(inode);
192 req->r_num_caps = 1;
193 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
194 if (!err)
195 err = ceph_init_file(inode, file, req->r_fmode);
196 ceph_mdsc_put_request(req);
197 dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
198out:
199 return err;
200}
201
202
203/*
204 * Do a lookup + open with a single request.
205 *
206 * If this succeeds, but some subsequent check in the vfs
207 * may_open() fails, the struct *file gets cleaned up (i.e.
208 * ceph_release gets called). So fear not!
209 */
210/*
211 * flags
212 * path_lookup_open -> LOOKUP_OPEN
213 * path_lookup_create -> LOOKUP_OPEN|LOOKUP_CREATE
214 */
215struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
216 struct nameidata *nd, int mode,
217 int locked_dir)
218{
219 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
220 struct ceph_mds_client *mdsc = &client->mdsc;
221 struct file *file = nd->intent.open.file;
222 struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
223 struct ceph_mds_request *req;
224 int err;
225 int flags = nd->intent.open.flags - 1; /* silly vfs! */
226
227 dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n",
228 dentry, dentry->d_name.len, dentry->d_name.name, flags, mode);
229
230 /* do the open */
231 req = prepare_open_request(dir->i_sb, flags, mode);
232 if (IS_ERR(req))
233		return ERR_CAST(req);
234 req->r_dentry = dget(dentry);
235 req->r_num_caps = 2;
236 if (flags & O_CREAT) {
237 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
238 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
239 }
240 req->r_locked_dir = dir; /* caller holds dir->i_mutex */
241 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
242 dentry = ceph_finish_lookup(req, dentry, err);
243 if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
244 err = ceph_handle_notrace_create(dir, dentry);
245 if (!err)
246 err = ceph_init_file(req->r_dentry->d_inode, file,
247 req->r_fmode);
248 ceph_mdsc_put_request(req);
249 dout("ceph_lookup_open result=%p\n", dentry);
250 return dentry;
251}
252
253int ceph_release(struct inode *inode, struct file *file)
254{
255 struct ceph_inode_info *ci = ceph_inode(inode);
256 struct ceph_file_info *cf = file->private_data;
257
258 dout("release inode %p file %p\n", inode, file);
259 ceph_put_fmode(ci, cf->fmode);
260 if (cf->last_readdir)
261 ceph_mdsc_put_request(cf->last_readdir);
262 kfree(cf->last_name);
263 kfree(cf->dir_info);
264 dput(cf->dentry);
265 kmem_cache_free(ceph_file_cachep, cf);
266
267 /* wake up anyone waiting for caps on this inode */
268 wake_up(&ci->i_cap_wq);
269 return 0;
270}
271
272/*
273 * build a vector of user pages
274 */
275static struct page **get_direct_page_vector(const char __user *data,
276 int num_pages,
277 loff_t off, size_t len)
278{
279 struct page **pages;
280 int rc;
281
282 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
283 if (!pages)
284 return ERR_PTR(-ENOMEM);
285
286 down_read(&current->mm->mmap_sem);
287 rc = get_user_pages(current, current->mm, (unsigned long)data,
288 num_pages, 0, 0, pages, NULL);
289 up_read(&current->mm->mmap_sem);
290 if (rc < 0)
291 goto fail;
292 return pages;
293
294fail:
295 kfree(pages);
296 return ERR_PTR(rc);
297}
298
299static void put_page_vector(struct page **pages, int num_pages)
300{
301 int i;
302
303 for (i = 0; i < num_pages; i++)
304 put_page(pages[i]);
305 kfree(pages);
306}
307
308void ceph_release_page_vector(struct page **pages, int num_pages)
309{
310 int i;
311
312 for (i = 0; i < num_pages; i++)
313 __free_pages(pages[i], 0);
314 kfree(pages);
315}
316
317/*
318 * allocate a vector of new pages
319 */
320static struct page **alloc_page_vector(int num_pages)
321{
322 struct page **pages;
323 int i;
324
325 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
326 if (!pages)
327 return ERR_PTR(-ENOMEM);
328 for (i = 0; i < num_pages; i++) {
329 pages[i] = alloc_page(GFP_NOFS);
330 if (pages[i] == NULL) {
331 ceph_release_page_vector(pages, i);
332 return ERR_PTR(-ENOMEM);
333 }
334 }
335 return pages;
336}
337
338/*
339 * copy user data into a page vector
340 */
341static int copy_user_to_page_vector(struct page **pages,
342 const char __user *data,
343 loff_t off, size_t len)
344{
345 int i = 0;
346 int po = off & ~PAGE_CACHE_MASK;
347 int left = len;
348 int l, bad;
349
350 while (left > 0) {
351 l = min_t(int, PAGE_CACHE_SIZE-po, left);
352 bad = copy_from_user(page_address(pages[i]) + po, data, l);
353 if (bad == l)
354 return -EFAULT;
355 data += l - bad;
356 left -= l - bad;
357 po += l - bad;
358 if (po == PAGE_CACHE_SIZE) {
359 po = 0;
360 i++;
361 }
362 }
363 return len;
364}
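/*
 * Worked example (illustrative, 4k pages): for off = 4196 and
 * len = 5000, po starts at 100; the first pass copies 3996 bytes into
 * page 0 at offset 100, po wraps to 0 and i advances, and the second
 * pass copies the remaining 1004 bytes to the start of page 1.
 */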
365
366/*
367 * copy data from a page vector into a user buffer
368 */
369static int copy_page_vector_to_user(struct page **pages, char __user *data,
370 loff_t off, size_t len)
371{
372 int i = 0;
373 int po = off & ~PAGE_CACHE_MASK;
374 int left = len;
375 int l, bad;
376
377 while (left > 0) {
378 l = min_t(int, left, PAGE_CACHE_SIZE-po);
379 bad = copy_to_user(data, page_address(pages[i]) + po, l);
380 if (bad == l)
381 return -EFAULT;
382 data += l - bad;
383 left -= l - bad;
384 if (po) {
385 po += l - bad;
386 if (po == PAGE_CACHE_SIZE)
387 po = 0;
388 }
389 i++;
390 }
391 return len;
392}
393
394/*
395 * Zero an extent within a page vector. Offset is relative to the
396 * start of the first page.
397 */
398static void zero_page_vector_range(int off, int len, struct page **pages)
399{
400 int i = off >> PAGE_CACHE_SHIFT;
401
402 off &= ~PAGE_CACHE_MASK;
403
404 dout("zero_page_vector_page %u~%u\n", off, len);
405
406 /* leading partial page? */
407 if (off) {
408 int end = min((int)PAGE_CACHE_SIZE, off + len);
409 dout("zeroing %d %p head from %d\n", i, pages[i],
410 (int)off);
411 zero_user_segment(pages[i], off, end);
412 len -= (end - off);
413 i++;
414 }
415 while (len >= PAGE_CACHE_SIZE) {
416 dout("zeroing %d %p len=%d\n", i, pages[i], len);
417 zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
418 len -= PAGE_CACHE_SIZE;
419 i++;
420 }
421 /* trailing partial page? */
422 if (len) {
423 dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
424 zero_user_segment(pages[i], 0, len);
425 }
426}
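/*
 * Worked example (illustrative, 4k pages): off = 100, len = 8292
 * zeroes the head of page 0 from 100 to 4096 (3996 bytes), all of
 * page 1, and the first 200 bytes of page 2.
 */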
427
428
429/*
430 * Read a range of bytes striped over one or more objects. Iterate over
431 * objects we stripe over. (That's not atomic, but good enough for now.)
432 *
433 * If we get a short result from the OSD, check against i_size; we need to
434 * only return a short read to the caller if we hit EOF.
435 */
436static int striped_read(struct inode *inode,
437 u64 off, u64 len,
438 struct page **pages, int num_pages,
439 int *checkeof)
440{
441 struct ceph_client *client = ceph_inode_to_client(inode);
442 struct ceph_inode_info *ci = ceph_inode(inode);
443 u64 pos, this_len;
444 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
445 int left, pages_left;
446 int read;
447 struct page **page_pos;
448 int ret;
449 bool hit_stripe, was_short;
450
451 /*
452 * we may need to do multiple reads. not atomic, unfortunately.
453 */
454 pos = off;
455 left = len;
456 page_pos = pages;
457 pages_left = num_pages;
458 read = 0;
459
460more:
461 this_len = left;
462 ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode),
463 &ci->i_layout, pos, &this_len,
464 ci->i_truncate_seq,
465 ci->i_truncate_size,
466 page_pos, pages_left);
467 hit_stripe = this_len < left;
468 was_short = ret >= 0 && ret < this_len;
469 if (ret == -ENOENT)
470 ret = 0;
471 dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read,
472 ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
473
474 if (ret > 0) {
475 int didpages =
476 ((pos & ~PAGE_CACHE_MASK) + ret) >> PAGE_CACHE_SHIFT;
477
478 if (read < pos - off) {
479 dout(" zero gap %llu to %llu\n", off + read, pos);
480 zero_page_vector_range(page_off + read,
481 pos - off - read, pages);
482 }
483 pos += ret;
484 read = pos - off;
485 left -= ret;
486 page_pos += didpages;
487 pages_left -= didpages;
488
489 /* hit stripe? */
490 if (left && hit_stripe)
491 goto more;
492 }
493
494 if (was_short) {
495 /* was original extent fully inside i_size? */
496 if (pos + left <= inode->i_size) {
497 dout("zero tail\n");
498 zero_page_vector_range(page_off + read, len - read,
499 pages);
500 read = len;
501 goto out;
502 }
503
504 /* check i_size */
505 *checkeof = 1;
506 }
507
508out:
509 if (ret >= 0)
510 ret = read;
511 dout("striped_read returns %d\n", ret);
512 return ret;
513}
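/*
 * Illustrative flow: an 8192-byte read that crosses an object
 * boundary at +4096 comes back with this_len trimmed to 4096
 * (hit_stripe), so the loop above issues a second read for the
 * remainder; if a read returns short but the original extent was
 * fully inside i_size, the tail of the page vector is zeroed (a
 * hole) and the full length is reported.
 */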
514
515/*
516 * Completely synchronous read and write methods. Direct from __user
517 * buffer to osd, or directly to user pages (if O_DIRECT).
518 *
519 * If the read spans object boundary, just do multiple reads.
520 */
521static ssize_t ceph_sync_read(struct file *file, char __user *data,
522 unsigned len, loff_t *poff, int *checkeof)
523{
524 struct inode *inode = file->f_dentry->d_inode;
525 struct page **pages;
526 u64 off = *poff;
527 int num_pages = calc_pages_for(off, len);
528 int ret;
529
530 dout("sync_read on file %p %llu~%u %s\n", file, off, len,
531 (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
532
533 if (file->f_flags & O_DIRECT) {
534 pages = get_direct_page_vector(data, num_pages, off, len);
535
536 /*
537 * flush any page cache pages in this range. this
538 * will make concurrent normal and O_DIRECT io slow,
539 * but it will at least behave sensibly when they are
540 * in sequence.
541 */
542 } else {
543 pages = alloc_page_vector(num_pages);
544 }
545 if (IS_ERR(pages))
546 return PTR_ERR(pages);
547
548 ret = filemap_write_and_wait(inode->i_mapping);
549 if (ret < 0)
550 goto done;
551
552 ret = striped_read(inode, off, len, pages, num_pages, checkeof);
553
554 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
555 ret = copy_page_vector_to_user(pages, data, off, ret);
556 if (ret >= 0)
557 *poff = off + ret;
558
559done:
560 if (file->f_flags & O_DIRECT)
561 put_page_vector(pages, num_pages);
562 else
563 ceph_release_page_vector(pages, num_pages);
564 dout("sync_read result %d\n", ret);
565 return ret;
566}
567
568/*
569 * Write commit callback, called if we requested both an ACK and
570 * ONDISK commit reply from the OSD.
571 */
572static void sync_write_commit(struct ceph_osd_request *req,
573 struct ceph_msg *msg)
574{
575 struct ceph_inode_info *ci = ceph_inode(req->r_inode);
576
577 dout("sync_write_commit %p tid %llu\n", req, req->r_tid);
578 spin_lock(&ci->i_unsafe_lock);
579 list_del_init(&req->r_unsafe_item);
580 spin_unlock(&ci->i_unsafe_lock);
581 ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
582}
583
584/*
585 * Synchronous write, straight from __user pointer or user pages (if
586 * O_DIRECT).
587 *
588 * If write spans object boundary, just do multiple writes. (For a
589 * correct atomic write, we should e.g. take write locks on all
590 * objects, rollback on failure, etc.)
591 */
592static ssize_t ceph_sync_write(struct file *file, const char __user *data,
593 size_t left, loff_t *offset)
594{
595 struct inode *inode = file->f_dentry->d_inode;
596 struct ceph_inode_info *ci = ceph_inode(inode);
597 struct ceph_client *client = ceph_inode_to_client(inode);
598 struct ceph_osd_request *req;
599 struct page **pages;
600 int num_pages;
601	unsigned long long pos;
602 u64 len;
603 int written = 0;
604 int flags;
605 int do_sync = 0;
606 int check_caps = 0;
607 int ret;
608 struct timespec mtime = CURRENT_TIME;
609
610 if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP)
611 return -EROFS;
612
613 dout("sync_write on file %p %lld~%u %s\n", file, *offset,
614 (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
615
616 if (file->f_flags & O_APPEND)
617 pos = i_size_read(inode);
618 else
619 pos = *offset;
620
621 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
622 if (ret < 0)
623 return ret;
624
625 ret = invalidate_inode_pages2_range(inode->i_mapping,
626 pos >> PAGE_CACHE_SHIFT,
627 (pos + left) >> PAGE_CACHE_SHIFT);
628 if (ret < 0)
629 dout("invalidate_inode_pages2_range returned %d\n", ret);
630
631 flags = CEPH_OSD_FLAG_ORDERSNAP |
632 CEPH_OSD_FLAG_ONDISK |
633 CEPH_OSD_FLAG_WRITE;
634 if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
635 flags |= CEPH_OSD_FLAG_ACK;
636 else
637 do_sync = 1;
638
639 /*
640 * we may need to do multiple writes here if we span an object
641 * boundary. this isn't atomic, unfortunately. :(
642 */
643more:
644 len = left;
645 req = ceph_osdc_new_request(&client->osdc, &ci->i_layout,
646 ceph_vino(inode), pos, &len,
647 CEPH_OSD_OP_WRITE, flags,
648 ci->i_snap_realm->cached_context,
649 do_sync,
650 ci->i_truncate_seq, ci->i_truncate_size,
651 &mtime, false, 2);
652 if (IS_ERR(req))
653 return PTR_ERR(req);
654
655 num_pages = calc_pages_for(pos, len);
656
657 if (file->f_flags & O_DIRECT) {
658 pages = get_direct_page_vector(data, num_pages, pos, len);
659 if (IS_ERR(pages)) {
660 ret = PTR_ERR(pages);
661 goto out;
662 }
663
664 /*
665 * throw out any page cache pages in this range. this
666 * may block.
667 */
668 truncate_inode_pages_range(inode->i_mapping, pos,
669 (pos+len) | (PAGE_CACHE_SIZE-1));
670 } else {
671 pages = alloc_page_vector(num_pages);
672 if (IS_ERR(pages)) {
673 ret = PTR_ERR(pages);
674 goto out;
675 }
676 ret = copy_user_to_page_vector(pages, data, pos, len);
677 if (ret < 0) {
678 ceph_release_page_vector(pages, num_pages);
679 goto out;
680 }
681
682 if ((file->f_flags & O_SYNC) == 0) {
683 /* get a second commit callback */
684 req->r_safe_callback = sync_write_commit;
685 req->r_own_pages = 1;
686 }
687 }
688 req->r_pages = pages;
689 req->r_num_pages = num_pages;
690 req->r_inode = inode;
691
692 ret = ceph_osdc_start_request(&client->osdc, req, false);
693 if (!ret) {
694 if (req->r_safe_callback) {
695 /*
696 * Add to inode unsafe list only after we
697 * start_request so that a tid has been assigned.
698 */
699 spin_lock(&ci->i_unsafe_lock);
700			list_add(&req->r_unsafe_item, &ci->i_unsafe_writes);
701 spin_unlock(&ci->i_unsafe_lock);
702 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
703 }
704 ret = ceph_osdc_wait_request(&client->osdc, req);
705 }
706
707 if (file->f_flags & O_DIRECT)
708 put_page_vector(pages, num_pages);
709 else if (file->f_flags & O_SYNC)
710 ceph_release_page_vector(pages, num_pages);
711
712out:
713 ceph_osdc_put_request(req);
714 if (ret == 0) {
715 pos += len;
716 written += len;
717 left -= len;
718 if (left)
719 goto more;
720
721 ret = written;
722 *offset = pos;
723 if (pos > i_size_read(inode))
724 check_caps = ceph_inode_set_size(inode, pos);
725 if (check_caps)
726 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
727 NULL);
728 }
729 return ret;
730}
731
732/*
733 * Wrap generic_file_aio_read with checks for cap bits on the inode.
734 * Atomically grab references, so that those bits are not released
735 * back to the MDS mid-read.
736 *
737 * Hmm, the sync read case isn't actually async... should it be?
738 */
739static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
740 unsigned long nr_segs, loff_t pos)
741{
742 struct file *filp = iocb->ki_filp;
743 loff_t *ppos = &iocb->ki_pos;
744 size_t len = iov->iov_len;
745 struct inode *inode = filp->f_dentry->d_inode;
746 struct ceph_inode_info *ci = ceph_inode(inode);
747 void *base = iov->iov_base;
748 ssize_t ret;
749 int got = 0;
750 int checkeof = 0, read = 0;
751
752 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
753 inode, ceph_vinop(inode), pos, (unsigned)len, inode);
754again:
755 __ceph_do_pending_vmtruncate(inode);
756 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE,
757 &got, -1);
758 if (ret < 0)
759 goto out;
760 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
761 inode, ceph_vinop(inode), pos, (unsigned)len,
762 ceph_cap_string(got));
763
764 if ((got & CEPH_CAP_FILE_CACHE) == 0 ||
765 (iocb->ki_filp->f_flags & O_DIRECT) ||
766 (inode->i_sb->s_flags & MS_SYNCHRONOUS))
767 /* hmm, this isn't really async... */
768 ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
769 else
770 ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
771
772out:
773 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
774 inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
775 ceph_put_cap_refs(ci, got);
776
777 if (checkeof && ret >= 0) {
778 int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
779
780 /* hit EOF or hole? */
781 if (statret == 0 && *ppos < inode->i_size) {
782 dout("aio_read sync_read hit hole, reading more\n");
783 read += ret;
784 base += ret;
785 len -= ret;
786 checkeof = 0;
787 goto again;
788 }
789 }
790 if (ret >= 0)
791 ret += read;
792
793 return ret;
794}
795
796/*
797 * Take cap references to avoid releasing caps to MDS mid-write.
798 *
799 * If we are synchronous, and write with an old snap context, the OSD
800 * may return EOLDSNAPC. In that case, retry the write... _after_
801 * dropping our cap refs and allowing the pending snap to logically
802 * complete _before_ this write occurs.
803 *
804 * If we are near ENOSPC, write synchronously.
805 */
806static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
807 unsigned long nr_segs, loff_t pos)
808{
809 struct file *file = iocb->ki_filp;
810 struct inode *inode = file->f_dentry->d_inode;
811 struct ceph_inode_info *ci = ceph_inode(inode);
812 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
813 loff_t endoff = pos + iov->iov_len;
814 int got = 0;
815 int ret, err;
816
817 if (ceph_snap(inode) != CEPH_NOSNAP)
818 return -EROFS;
819
820retry_snap:
821 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
822 return -ENOSPC;
823 __ceph_do_pending_vmtruncate(inode);
824 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
825 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
826 inode->i_size);
827 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
828 &got, endoff);
829 if (ret < 0)
830 goto out;
831
832 dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n",
833 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
834 ceph_cap_string(got));
835
836 if ((got & CEPH_CAP_FILE_BUFFER) == 0 ||
837 (iocb->ki_filp->f_flags & O_DIRECT) ||
838 (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
839 ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
840 &iocb->ki_pos);
841 } else {
842 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
843
844 if ((ret >= 0 || ret == -EIOCBQUEUED) &&
845 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
846 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
847 err = vfs_fsync_range(file, file->f_path.dentry,
848 pos, pos + ret - 1, 1);
849 if (err < 0)
850 ret = err;
851 }
852 }
853 if (ret >= 0) {
854 spin_lock(&inode->i_lock);
855 __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
856 spin_unlock(&inode->i_lock);
857 }
858
859out:
860 dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
861 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
862 ceph_cap_string(got));
863 ceph_put_cap_refs(ci, got);
864
865 if (ret == -EOLDSNAPC) {
866 dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
867 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
868 goto retry_snap;
869 }
870
871 return ret;
872}
873
874/*
875 * llseek. be sure to verify file size on SEEK_END.
876 */
877static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)
878{
879 struct inode *inode = file->f_mapping->host;
880 int ret;
881
882 mutex_lock(&inode->i_mutex);
883 __ceph_do_pending_vmtruncate(inode);
884 switch (origin) {
885 case SEEK_END:
886 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
887 if (ret < 0) {
888 offset = ret;
889 goto out;
890 }
891 offset += inode->i_size;
892 break;
893 case SEEK_CUR:
894 /*
895 * Here we special-case the lseek(fd, 0, SEEK_CUR)
896 * position-querying operation. Avoid rewriting the "same"
897 * f_pos value back to the file because a concurrent read(),
898 * write() or lseek() might have altered it
899 */
900 if (offset == 0) {
901 offset = file->f_pos;
902 goto out;
903 }
904 offset += file->f_pos;
905 break;
906 }
907
908 if (offset < 0 || offset > inode->i_sb->s_maxbytes) {
909 offset = -EINVAL;
910 goto out;
911 }
912
913 /* Special lock needed here? */
914 if (offset != file->f_pos) {
915 file->f_pos = offset;
916 file->f_version = 0;
917 }
918
919out:
920 mutex_unlock(&inode->i_mutex);
921 return offset;
922}
923
924const struct file_operations ceph_file_fops = {
925 .open = ceph_open,
926 .release = ceph_release,
927 .llseek = ceph_llseek,
928 .read = do_sync_read,
929 .write = do_sync_write,
930 .aio_read = ceph_aio_read,
931 .aio_write = ceph_aio_write,
932 .mmap = ceph_mmap,
933 .fsync = ceph_fsync,
934 .splice_read = generic_file_splice_read,
935 .splice_write = generic_file_splice_write,
936 .unlocked_ioctl = ceph_ioctl,
937 .compat_ioctl = ceph_ioctl,
938};
939
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
new file mode 100644
index 000000000000..85b4d2ffdeba
--- /dev/null
+++ b/fs/ceph/inode.c
@@ -0,0 +1,1782 @@
1#include "ceph_debug.h"
2
3#include <linux/module.h>
4#include <linux/fs.h>
5#include <linux/smp_lock.h>
6#include <linux/slab.h>
7#include <linux/string.h>
8#include <linux/uaccess.h>
9#include <linux/kernel.h>
10#include <linux/namei.h>
11#include <linux/writeback.h>
12#include <linux/vmalloc.h>
13#include <linux/pagevec.h>
14
15#include "super.h"
16#include "decode.h"
17
18/*
19 * Ceph inode operations
20 *
21 * Implement basic inode helpers (get, alloc) and inode ops (getattr,
22 * setattr, etc.), xattr helpers, and helpers for assimilating
23 * metadata returned by the MDS into our cache.
24 *
25 * Also define helpers for doing asynchronous writeback, invalidation,
26 * and truncation for the benefit of those who can't afford to block
27 * (typically because they are in the message handler path).
28 */
29
30static const struct inode_operations ceph_symlink_iops;
31
32static void ceph_invalidate_work(struct work_struct *work);
33static void ceph_writeback_work(struct work_struct *work);
34static void ceph_vmtruncate_work(struct work_struct *work);
35
36/*
37 * find or create an inode, given the ceph ino number
38 */
39struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
40{
41 struct inode *inode;
42 ino_t t = ceph_vino_to_ino(vino);
43
44 inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
45 if (inode == NULL)
46 return ERR_PTR(-ENOMEM);
47 if (inode->i_state & I_NEW) {
48 dout("get_inode created new inode %p %llx.%llx ino %llx\n",
49 inode, ceph_vinop(inode), (u64)inode->i_ino);
50 unlock_new_inode(inode);
51 }
52
53 dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
54 vino.snap, inode);
55 return inode;
56}
57
58/*
59 * get/construct snapdir inode for a given directory
60 */
61struct inode *ceph_get_snapdir(struct inode *parent)
62{
63 struct ceph_vino vino = {
64 .ino = ceph_ino(parent),
65 .snap = CEPH_SNAPDIR,
66 };
67 struct inode *inode = ceph_get_inode(parent->i_sb, vino);
68 struct ceph_inode_info *ci = ceph_inode(inode);
69
70 BUG_ON(!S_ISDIR(parent->i_mode));
71 if (IS_ERR(inode))
72 return ERR_PTR(PTR_ERR(inode));
73 inode->i_mode = parent->i_mode;
74 inode->i_uid = parent->i_uid;
75 inode->i_gid = parent->i_gid;
76 inode->i_op = &ceph_dir_iops;
77 inode->i_fop = &ceph_dir_fops;
78 ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
79 ci->i_rbytes = 0;
80 return inode;
81}
82
83const struct inode_operations ceph_file_iops = {
84 .permission = ceph_permission,
85 .setattr = ceph_setattr,
86 .getattr = ceph_getattr,
87 .setxattr = ceph_setxattr,
88 .getxattr = ceph_getxattr,
89 .listxattr = ceph_listxattr,
90 .removexattr = ceph_removexattr,
91};
92
93
94/*
95 * We use a 'frag tree' to keep track of the MDS's directory fragments
96 * for a given inode (usually there is just a single fragment). We
97 * need to know when a child frag is delegated to a new MDS, or when
98 * it is flagged as replicated, so we can direct our requests
99 * accordingly.
100 */
101
102/*
103 * find/create a frag in the tree
104 */
105static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
106 u32 f)
107{
108 struct rb_node **p;
109 struct rb_node *parent = NULL;
110 struct ceph_inode_frag *frag;
111 int c;
112
113 p = &ci->i_fragtree.rb_node;
114 while (*p) {
115 parent = *p;
116 frag = rb_entry(parent, struct ceph_inode_frag, node);
117 c = ceph_frag_compare(f, frag->frag);
118 if (c < 0)
119 p = &(*p)->rb_left;
120 else if (c > 0)
121 p = &(*p)->rb_right;
122 else
123 return frag;
124 }
125
126 frag = kmalloc(sizeof(*frag), GFP_NOFS);
127 if (!frag) {
128 pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx "
129 "frag %x\n", &ci->vfs_inode,
130 ceph_vinop(&ci->vfs_inode), f);
131 return ERR_PTR(-ENOMEM);
132 }
133 frag->frag = f;
134 frag->split_by = 0;
135 frag->mds = -1;
136 frag->ndist = 0;
137
138 rb_link_node(&frag->node, parent, p);
139 rb_insert_color(&frag->node, &ci->i_fragtree);
140
141 dout("get_or_create_frag added %llx.%llx frag %x\n",
142 ceph_vinop(&ci->vfs_inode), f);
143 return frag;
144}
145
146/*
147 * find a specific frag @f
148 */
149struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
150{
151 struct rb_node *n = ci->i_fragtree.rb_node;
152
153 while (n) {
154 struct ceph_inode_frag *frag =
155 rb_entry(n, struct ceph_inode_frag, node);
156 int c = ceph_frag_compare(f, frag->frag);
157 if (c < 0)
158 n = n->rb_left;
159 else if (c > 0)
160 n = n->rb_right;
161 else
162 return frag;
163 }
164 return NULL;
165}
166
167/*
168 * Choose frag containing the given value @v. If @pfrag is
169 * specified, copy the frag delegation info to the caller if
170 * it is present.
171 */
172u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
173 struct ceph_inode_frag *pfrag,
174 int *found)
175{
176 u32 t = ceph_frag_make(0, 0);
177 struct ceph_inode_frag *frag;
178 unsigned nway, i;
179 u32 n;
180
181 if (found)
182 *found = 0;
183
184 mutex_lock(&ci->i_fragtree_mutex);
185 while (1) {
186 WARN_ON(!ceph_frag_contains_value(t, v));
187 frag = __ceph_find_frag(ci, t);
188 if (!frag)
189 break; /* t is a leaf */
190 if (frag->split_by == 0) {
191 if (pfrag)
192 memcpy(pfrag, frag, sizeof(*pfrag));
193 if (found)
194 *found = 1;
195 break;
196 }
197
198 /* choose child */
199 nway = 1 << frag->split_by;
200 dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t,
201 frag->split_by, nway);
202 for (i = 0; i < nway; i++) {
203 n = ceph_frag_make_child(t, frag->split_by, i);
204 if (ceph_frag_contains_value(n, v)) {
205 t = n;
206 break;
207 }
208 }
209 BUG_ON(i == nway);
210 }
211 dout("choose_frag(%x) = %x\n", v, t);
212
213 mutex_unlock(&ci->i_fragtree_mutex);
214 return t;
215}
216
217/*
218 * Process dirfrag (delegation) info from the mds. Include leaf
219 * fragment in tree ONLY if ndist > 0. Otherwise, only
220 * branches/splits are included in i_fragtree.
221 */
222static int ceph_fill_dirfrag(struct inode *inode,
223 struct ceph_mds_reply_dirfrag *dirinfo)
224{
225 struct ceph_inode_info *ci = ceph_inode(inode);
226 struct ceph_inode_frag *frag;
227 u32 id = le32_to_cpu(dirinfo->frag);
228 int mds = le32_to_cpu(dirinfo->auth);
229 int ndist = le32_to_cpu(dirinfo->ndist);
230 int i;
231 int err = 0;
232
233 mutex_lock(&ci->i_fragtree_mutex);
234 if (ndist == 0) {
235 /* no delegation info needed. */
236 frag = __ceph_find_frag(ci, id);
237 if (!frag)
238 goto out;
239 if (frag->split_by == 0) {
240 /* tree leaf, remove */
241 dout("fill_dirfrag removed %llx.%llx frag %x"
242 " (no ref)\n", ceph_vinop(inode), id);
243 rb_erase(&frag->node, &ci->i_fragtree);
244 kfree(frag);
245 } else {
246 /* tree branch, keep and clear */
247 dout("fill_dirfrag cleared %llx.%llx frag %x"
248 " referral\n", ceph_vinop(inode), id);
249 frag->mds = -1;
250 frag->ndist = 0;
251 }
252 goto out;
253 }
254
255
256 /* find/add this frag to store mds delegation info */
257 frag = __get_or_create_frag(ci, id);
258 if (IS_ERR(frag)) {
259 /* this is not the end of the world; we can continue
260 with bad/inaccurate delegation info */
261 pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n",
262 ceph_vinop(inode), le32_to_cpu(dirinfo->frag));
263 err = -ENOMEM;
264 goto out;
265 }
266
267 frag->mds = mds;
268 frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
269 for (i = 0; i < frag->ndist; i++)
270 frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
271 dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
272 ceph_vinop(inode), frag->frag, frag->ndist);
273
274out:
275 mutex_unlock(&ci->i_fragtree_mutex);
276 return err;
277}
278
279
280/*
281 * initialize a newly allocated inode.
282 */
283struct inode *ceph_alloc_inode(struct super_block *sb)
284{
285 struct ceph_inode_info *ci;
286 int i;
287
288 ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
289 if (!ci)
290 return NULL;
291
292 dout("alloc_inode %p\n", &ci->vfs_inode);
293
294 ci->i_version = 0;
295 ci->i_time_warp_seq = 0;
296 ci->i_ceph_flags = 0;
297 ci->i_release_count = 0;
298 ci->i_symlink = NULL;
299
300 ci->i_fragtree = RB_ROOT;
301 mutex_init(&ci->i_fragtree_mutex);
302
303 ci->i_xattrs.blob = NULL;
304 ci->i_xattrs.prealloc_blob = NULL;
305 ci->i_xattrs.dirty = false;
306 ci->i_xattrs.index = RB_ROOT;
307 ci->i_xattrs.count = 0;
308 ci->i_xattrs.names_size = 0;
309 ci->i_xattrs.vals_size = 0;
310 ci->i_xattrs.version = 0;
311 ci->i_xattrs.index_version = 0;
312
313 ci->i_caps = RB_ROOT;
314 ci->i_auth_cap = NULL;
315 ci->i_dirty_caps = 0;
316 ci->i_flushing_caps = 0;
317 INIT_LIST_HEAD(&ci->i_dirty_item);
318 INIT_LIST_HEAD(&ci->i_flushing_item);
319 ci->i_cap_flush_seq = 0;
320 ci->i_cap_flush_last_tid = 0;
321 memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
322 init_waitqueue_head(&ci->i_cap_wq);
323 ci->i_hold_caps_min = 0;
324 ci->i_hold_caps_max = 0;
325 INIT_LIST_HEAD(&ci->i_cap_delay_list);
326 ci->i_cap_exporting_mds = 0;
327 ci->i_cap_exporting_mseq = 0;
328 ci->i_cap_exporting_issued = 0;
329 INIT_LIST_HEAD(&ci->i_cap_snaps);
330 ci->i_head_snapc = NULL;
331 ci->i_snap_caps = 0;
332
333 for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
334 ci->i_nr_by_mode[i] = 0;
335
336 ci->i_truncate_seq = 0;
337 ci->i_truncate_size = 0;
338 ci->i_truncate_pending = 0;
339
340 ci->i_max_size = 0;
341 ci->i_reported_size = 0;
342 ci->i_wanted_max_size = 0;
343 ci->i_requested_max_size = 0;
344
345 ci->i_pin_ref = 0;
346 ci->i_rd_ref = 0;
347 ci->i_rdcache_ref = 0;
348 ci->i_wr_ref = 0;
349 ci->i_wrbuffer_ref = 0;
350 ci->i_wrbuffer_ref_head = 0;
351 ci->i_shared_gen = 0;
352 ci->i_rdcache_gen = 0;
353 ci->i_rdcache_revoking = 0;
354
355 INIT_LIST_HEAD(&ci->i_unsafe_writes);
356 INIT_LIST_HEAD(&ci->i_unsafe_dirops);
357 spin_lock_init(&ci->i_unsafe_lock);
358
359 ci->i_snap_realm = NULL;
360 INIT_LIST_HEAD(&ci->i_snap_realm_item);
361 INIT_LIST_HEAD(&ci->i_snap_flush_item);
362
363 INIT_WORK(&ci->i_wb_work, ceph_writeback_work);
364 INIT_WORK(&ci->i_pg_inv_work, ceph_invalidate_work);
365
366 INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
367
368 return &ci->vfs_inode;
369}
370
371void ceph_destroy_inode(struct inode *inode)
372{
373 struct ceph_inode_info *ci = ceph_inode(inode);
374 struct ceph_inode_frag *frag;
375 struct rb_node *n;
376
377 dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
378
379 ceph_queue_caps_release(inode);
380
381 /*
382 * we may still have a snap_realm reference if there are stray
383 * caps in i_cap_exporting_issued or i_snap_caps.
384 */
385 if (ci->i_snap_realm) {
386 struct ceph_mds_client *mdsc =
387 &ceph_client(ci->vfs_inode.i_sb)->mdsc;
388 struct ceph_snap_realm *realm = ci->i_snap_realm;
389
390 dout(" dropping residual ref to snap realm %p\n", realm);
391 spin_lock(&realm->inodes_with_caps_lock);
392 list_del_init(&ci->i_snap_realm_item);
393 spin_unlock(&realm->inodes_with_caps_lock);
394 ceph_put_snap_realm(mdsc, realm);
395 }
396
397 kfree(ci->i_symlink);
398 while ((n = rb_first(&ci->i_fragtree)) != NULL) {
399 frag = rb_entry(n, struct ceph_inode_frag, node);
400 rb_erase(n, &ci->i_fragtree);
401 kfree(frag);
402 }
403
404 __ceph_destroy_xattrs(ci);
405 if (ci->i_xattrs.blob)
406 ceph_buffer_put(ci->i_xattrs.blob);
407 if (ci->i_xattrs.prealloc_blob)
408 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
409
410 kmem_cache_free(ceph_inode_cachep, ci);
411}
412
413
414/*
415 * Helpers to fill in size, ctime, mtime, and atime. We have to be
416 * careful because either the client or MDS may have more up to date
417 * info, depending on which capabilities are held, and whether
418 * time_warp_seq or truncate_seq have increased. (Ordinarily, mtime
419 * and size are monotonically increasing, except when utimes() or
420 * truncate() increments the corresponding _seq values.)
421 */
422int ceph_fill_file_size(struct inode *inode, int issued,
423 u32 truncate_seq, u64 truncate_size, u64 size)
424{
425 struct ceph_inode_info *ci = ceph_inode(inode);
426 int queue_trunc = 0;
427
428 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
429 (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
430 dout("size %lld -> %llu\n", inode->i_size, size);
431 inode->i_size = size;
432 inode->i_blocks = (size + (1<<9) - 1) >> 9;
433 ci->i_reported_size = size;
434 if (truncate_seq != ci->i_truncate_seq) {
435 dout("truncate_seq %u -> %u\n",
436 ci->i_truncate_seq, truncate_seq);
437 ci->i_truncate_seq = truncate_seq;
438 /*
439 * If we hold relevant caps, or in the case where we're
440 * not the only client referencing this file and we
441 * don't hold those caps, then we need to check whether
442 * the file is either opened or mmapped
443 */
444 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
445 CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
446 CEPH_CAP_FILE_EXCL)) ||
447 mapping_mapped(inode->i_mapping) ||
448 __ceph_caps_file_wanted(ci)) {
449 ci->i_truncate_pending++;
450 queue_trunc = 1;
451 }
452 }
453 }
454 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 &&
455 ci->i_truncate_size != truncate_size) {
456 dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
457 truncate_size);
458 ci->i_truncate_size = truncate_size;
459 }
460 return queue_trunc;
461}
462
463void ceph_fill_file_time(struct inode *inode, int issued,
464 u64 time_warp_seq, struct timespec *ctime,
465 struct timespec *mtime, struct timespec *atime)
466{
467 struct ceph_inode_info *ci = ceph_inode(inode);
468 int warn = 0;
469
470 if (issued & (CEPH_CAP_FILE_EXCL|
471 CEPH_CAP_FILE_WR|
472 CEPH_CAP_FILE_BUFFER)) {
473 if (timespec_compare(ctime, &inode->i_ctime) > 0) {
474 dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
475 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
476 ctime->tv_sec, ctime->tv_nsec);
477 inode->i_ctime = *ctime;
478 }
479 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
480 /* the MDS did a utimes() */
481 dout("mtime %ld.%09ld -> %ld.%09ld "
482 "tw %d -> %d\n",
483 inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
484 mtime->tv_sec, mtime->tv_nsec,
485 ci->i_time_warp_seq, (int)time_warp_seq);
486
487 inode->i_mtime = *mtime;
488 inode->i_atime = *atime;
489 ci->i_time_warp_seq = time_warp_seq;
490 } else if (time_warp_seq == ci->i_time_warp_seq) {
491 /* nobody did utimes(); take the max */
492 if (timespec_compare(mtime, &inode->i_mtime) > 0) {
493 dout("mtime %ld.%09ld -> %ld.%09ld inc\n",
494 inode->i_mtime.tv_sec,
495 inode->i_mtime.tv_nsec,
496 mtime->tv_sec, mtime->tv_nsec);
497 inode->i_mtime = *mtime;
498 }
499 if (timespec_compare(atime, &inode->i_atime) > 0) {
500 dout("atime %ld.%09ld -> %ld.%09ld inc\n",
501 inode->i_atime.tv_sec,
502 inode->i_atime.tv_nsec,
503 atime->tv_sec, atime->tv_nsec);
504 inode->i_atime = *atime;
505 }
506 } else if (issued & CEPH_CAP_FILE_EXCL) {
507 /* we did a utimes(); ignore mds values */
508 } else {
509 warn = 1;
510 }
511 } else {
512 /* we have no write caps; whatever the MDS says is true */
513 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
514 inode->i_ctime = *ctime;
515 inode->i_mtime = *mtime;
516 inode->i_atime = *atime;
517 ci->i_time_warp_seq = time_warp_seq;
518 } else {
519 warn = 1;
520 }
521 }
522 if (warn) /* time_warp_seq shouldn't go backwards */
523 dout("%p mds time_warp_seq %llu < %u\n",
524 inode, time_warp_seq, ci->i_time_warp_seq);
525}
526
527/*
528 * Populate an inode based on info from mds. May be called on new or
529 * existing inodes.
530 */
531static int fill_inode(struct inode *inode,
532 struct ceph_mds_reply_info_in *iinfo,
533 struct ceph_mds_reply_dirfrag *dirinfo,
534 struct ceph_mds_session *session,
535 unsigned long ttl_from, int cap_fmode,
536 struct ceph_cap_reservation *caps_reservation)
537{
538 struct ceph_mds_reply_inode *info = iinfo->in;
539 struct ceph_inode_info *ci = ceph_inode(inode);
540 int i;
541 int issued, implemented;
542 struct timespec mtime, atime, ctime;
543 u32 nsplits;
544 struct ceph_buffer *xattr_blob = NULL;
545 int err = 0;
546 int queue_trunc = 0;
547
548 dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
549 inode, ceph_vinop(inode), le64_to_cpu(info->version),
550 ci->i_version);
551
552 /*
553 * prealloc xattr data, if it looks like we'll need it. only
554 * if len > 4 (meaning there are actually xattrs; the first 4
555 * bytes are the xattr count).
556 */
557 if (iinfo->xattr_len > 4) {
558 xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
559 if (!xattr_blob)
560 pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
561 iinfo->xattr_len);
562 }
563
564 spin_lock(&inode->i_lock);
565
566 /*
567 * provided version will be odd if inode value is projected,
568 * even if stable. skip the update if we have newer info
569 * (e.g., due to inode info racing from multiple MDSs), or if
570 * we are getting projected (unstable) inode info.
571 */
572 if (le64_to_cpu(info->version) > 0 &&
573 (ci->i_version & ~1) > le64_to_cpu(info->version))
574 goto no_change;
575
576 issued = __ceph_caps_issued(ci, &implemented);
577 issued |= implemented | __ceph_caps_dirty(ci);
578
579 /* update inode */
580 ci->i_version = le64_to_cpu(info->version);
581 inode->i_version++;
582 inode->i_rdev = le32_to_cpu(info->rdev);
583
584 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
585 inode->i_mode = le32_to_cpu(info->mode);
586 inode->i_uid = le32_to_cpu(info->uid);
587 inode->i_gid = le32_to_cpu(info->gid);
588 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
589 inode->i_uid, inode->i_gid);
590 }
591
592 if ((issued & CEPH_CAP_LINK_EXCL) == 0)
593 inode->i_nlink = le32_to_cpu(info->nlink);
594
595 /* be careful with mtime, atime, size */
596 ceph_decode_timespec(&atime, &info->atime);
597 ceph_decode_timespec(&mtime, &info->mtime);
598 ceph_decode_timespec(&ctime, &info->ctime);
599 queue_trunc = ceph_fill_file_size(inode, issued,
600 le32_to_cpu(info->truncate_seq),
601 le64_to_cpu(info->truncate_size),
602 le64_to_cpu(info->size));
603 ceph_fill_file_time(inode, issued,
604 le32_to_cpu(info->time_warp_seq),
605 &ctime, &mtime, &atime);
606
607 ci->i_max_size = le64_to_cpu(info->max_size);
608 ci->i_layout = info->layout;
609 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
610
611 /* xattrs */
612 /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
613 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
614 le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
615 if (ci->i_xattrs.blob)
616 ceph_buffer_put(ci->i_xattrs.blob);
617 ci->i_xattrs.blob = xattr_blob;
618 if (xattr_blob)
619 memcpy(ci->i_xattrs.blob->vec.iov_base,
620 iinfo->xattr_data, iinfo->xattr_len);
621 ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
622 }
623
624 inode->i_mapping->a_ops = &ceph_aops;
625 inode->i_mapping->backing_dev_info =
626 &ceph_client(inode->i_sb)->backing_dev_info;
627
628 switch (inode->i_mode & S_IFMT) {
629 case S_IFIFO:
630 case S_IFBLK:
631 case S_IFCHR:
632 case S_IFSOCK:
633 init_special_inode(inode, inode->i_mode, inode->i_rdev);
634 inode->i_op = &ceph_file_iops;
635 break;
636 case S_IFREG:
637 inode->i_op = &ceph_file_iops;
638 inode->i_fop = &ceph_file_fops;
639 break;
640 case S_IFLNK:
641 inode->i_op = &ceph_symlink_iops;
642 if (!ci->i_symlink) {
643 int symlen = iinfo->symlink_len;
644 char *sym;
645
646 BUG_ON(symlen != inode->i_size);
647 spin_unlock(&inode->i_lock);
648
649 err = -ENOMEM;
650 sym = kmalloc(symlen+1, GFP_NOFS);
651 if (!sym)
652 goto out;
653 memcpy(sym, iinfo->symlink, symlen);
654 sym[symlen] = 0;
655
656 spin_lock(&inode->i_lock);
657 if (!ci->i_symlink)
658 ci->i_symlink = sym;
659 else
660 kfree(sym); /* lost a race */
661 }
662 break;
663 case S_IFDIR:
664 inode->i_op = &ceph_dir_iops;
665 inode->i_fop = &ceph_dir_fops;
666
667 ci->i_files = le64_to_cpu(info->files);
668 ci->i_subdirs = le64_to_cpu(info->subdirs);
669 ci->i_rbytes = le64_to_cpu(info->rbytes);
670 ci->i_rfiles = le64_to_cpu(info->rfiles);
671 ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
672 ceph_decode_timespec(&ci->i_rctime, &info->rctime);
673
674 /* set dir completion flag? */
675 if (ci->i_files == 0 && ci->i_subdirs == 0 &&
676 ceph_snap(inode) == CEPH_NOSNAP &&
677 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) {
678 dout(" marking %p complete (empty)\n", inode);
679 ci->i_ceph_flags |= CEPH_I_COMPLETE;
680 ci->i_max_offset = 2;
681 }
682
683 /* it may be better to set st_size in getattr instead? */
684 if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES))
685 inode->i_size = ci->i_rbytes;
686 break;
687 default:
688 pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
689 ceph_vinop(inode), inode->i_mode);
690 }
691
692no_change:
693 spin_unlock(&inode->i_lock);
694
695 /* queue truncate if we saw i_size decrease */
696 if (queue_trunc)
697 ceph_queue_vmtruncate(inode);
698
699 /* populate frag tree */
700 /* FIXME: move me up, if/when version reflects fragtree changes */
701 nsplits = le32_to_cpu(info->fragtree.nsplits);
702 mutex_lock(&ci->i_fragtree_mutex);
703 for (i = 0; i < nsplits; i++) {
704 u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
705 struct ceph_inode_frag *frag = __get_or_create_frag(ci, id);
706
707 if (IS_ERR(frag))
708 continue;
709 frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
710 dout(" frag %x split by %d\n", frag->frag, frag->split_by);
711 }
712 mutex_unlock(&ci->i_fragtree_mutex);
713
714 /* were we issued a capability? */
715 if (info->cap.caps) {
716 if (ceph_snap(inode) == CEPH_NOSNAP) {
717 ceph_add_cap(inode, session,
718 le64_to_cpu(info->cap.cap_id),
719 cap_fmode,
720 le32_to_cpu(info->cap.caps),
721 le32_to_cpu(info->cap.wanted),
722 le32_to_cpu(info->cap.seq),
723 le32_to_cpu(info->cap.mseq),
724 le64_to_cpu(info->cap.realm),
725 info->cap.flags,
726 caps_reservation);
727 } else {
728 spin_lock(&inode->i_lock);
729 dout(" %p got snap_caps %s\n", inode,
730 ceph_cap_string(le32_to_cpu(info->cap.caps)));
731 ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
732 if (cap_fmode >= 0)
733 __ceph_get_fmode(ci, cap_fmode);
734 spin_unlock(&inode->i_lock);
735 }
736 } else if (cap_fmode >= 0) {
737 pr_warning("mds issued no caps on %llx.%llx\n",
738 ceph_vinop(inode));
739 __ceph_get_fmode(ci, cap_fmode);
740 }
741
742 /* update delegation info? */
743 if (dirinfo)
744 ceph_fill_dirfrag(inode, dirinfo);
745
746 err = 0;
747
748out:
749 if (xattr_blob)
750 ceph_buffer_put(xattr_blob);
751 return err;
752}
753
754/*
755 * caller should hold session s_mutex.
756 */
757static void update_dentry_lease(struct dentry *dentry,
758 struct ceph_mds_reply_lease *lease,
759 struct ceph_mds_session *session,
760 unsigned long from_time)
761{
762 struct ceph_dentry_info *di = ceph_dentry(dentry);
763 long unsigned duration = le32_to_cpu(lease->duration_ms);
764 long unsigned ttl = from_time + (duration * HZ) / 1000;
765 long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
766 struct inode *dir;
767
768 /* only track leases on regular dentries */
769 if (dentry->d_op != &ceph_dentry_ops)
770 return;
771
772 spin_lock(&dentry->d_lock);
773 dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n",
774 dentry, le16_to_cpu(lease->mask), duration, ttl);
775
776 /* make lease_rdcache_gen match directory */
777 dir = dentry->d_parent->d_inode;
778 di->lease_shared_gen = ceph_inode(dir)->i_shared_gen;
779
780 if (lease->mask == 0)
781 goto out_unlock;
782
783 if (di->lease_gen == session->s_cap_gen &&
784 time_before(ttl, dentry->d_time))
785 goto out_unlock; /* we already have a newer lease. */
786
787 if (di->lease_session && di->lease_session != session)
788 goto out_unlock;
789
790 ceph_dentry_lru_touch(dentry);
791
792 if (!di->lease_session)
793 di->lease_session = ceph_get_mds_session(session);
794 di->lease_gen = session->s_cap_gen;
795 di->lease_seq = le32_to_cpu(lease->seq);
796 di->lease_renew_after = half_ttl;
797 di->lease_renew_from = 0;
798 dentry->d_time = ttl;
799out_unlock:
800 spin_unlock(&dentry->d_lock);
801 return;
802}
803
804/*
805 * splice a dentry to an inode.
806 * caller must hold directory i_mutex for this to be safe.
807 *
808 * we will only rehash the resulting dentry if @prehash is
809 * true; @prehash will be set to false (for the benefit of
810 * the caller) if we fail.
811 */
812static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
813 bool *prehash)
814{
815 struct dentry *realdn;
816
817 /* dn must be unhashed */
818 if (!d_unhashed(dn))
819 d_drop(dn);
820 realdn = d_materialise_unique(dn, in);
821 if (IS_ERR(realdn)) {
822 pr_err("splice_dentry error %p inode %p ino %llx.%llx\n",
823 dn, in, ceph_vinop(in));
824 if (prehash)
825 *prehash = false; /* don't rehash on error */
826 dn = realdn; /* note realdn contains the error */
827 goto out;
828 } else if (realdn) {
829 dout("dn %p (%d) spliced with %p (%d) "
830 "inode %p ino %llx.%llx\n",
831 dn, atomic_read(&dn->d_count),
832 realdn, atomic_read(&realdn->d_count),
833 realdn->d_inode, ceph_vinop(realdn->d_inode));
834 dput(dn);
835 dn = realdn;
836 } else {
837 BUG_ON(!ceph_dentry(dn));
838
839 dout("dn %p attached to %p ino %llx.%llx\n",
840 dn, dn->d_inode, ceph_vinop(dn->d_inode));
841 }
842 if ((!prehash || *prehash) && d_unhashed(dn))
843 d_rehash(dn);
844out:
845 return dn;
846}
847
848/*
849 * Set dentry's directory position based on the current dir's max, and
850 * order it in d_subdirs, so that dcache_readdir behaves.
851 */
852static void ceph_set_dentry_offset(struct dentry *dn)
853{
854 struct dentry *dir = dn->d_parent;
855 struct inode *inode = dn->d_parent->d_inode;
856 struct ceph_dentry_info *di;
857
858 BUG_ON(!inode);
859
860 di = ceph_dentry(dn);
861
862 spin_lock(&inode->i_lock);
863 di->offset = ceph_inode(inode)->i_max_offset++;
864 spin_unlock(&inode->i_lock);
865
866 spin_lock(&dcache_lock);
867 spin_lock(&dn->d_lock);
868 list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
869 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
870 dn->d_u.d_child.prev, dn->d_u.d_child.next);
871 spin_unlock(&dn->d_lock);
872 spin_unlock(&dcache_lock);
873}
874
875/*
876 * Incorporate results into the local cache. This is either just
877 * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
878 * after a lookup).
879 *
880 * A reply may contain
881 * a directory inode along with a dentry,
882 * and/or a target inode.
883 *
884 * Called with snap_rwsem (read).
885 */
886int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
887 struct ceph_mds_session *session)
888{
889 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
890 struct inode *in = NULL;
891 struct ceph_mds_reply_inode *ininfo;
892 struct ceph_vino vino;
893 struct ceph_client *client = ceph_sb_to_client(sb);
894 int i = 0;
895 int err = 0;
896
897 dout("fill_trace %p is_dentry %d is_target %d\n", req,
898 rinfo->head->is_dentry, rinfo->head->is_target);
899
900#if 0
901 /*
902 * Debugging hook:
903 *
904 * If we resend completed ops to a recovering mds, we get no
905 * trace. Since that is very rare, pretend this is the case
906 * to ensure the 'no trace' handlers in the callers behave.
907 *
908 * Fill in inodes unconditionally to avoid breaking cap
909 * invariants.
910 */
911 if (rinfo->head->op & CEPH_MDS_OP_WRITE) {
912 pr_info("fill_trace faking empty trace on %lld %s\n",
913 req->r_tid, ceph_mds_op_name(rinfo->head->op));
914 if (rinfo->head->is_dentry) {
915 rinfo->head->is_dentry = 0;
916 err = fill_inode(req->r_locked_dir,
917 &rinfo->diri, rinfo->dirfrag,
918 session, req->r_request_started, -1);
919 }
920 if (rinfo->head->is_target) {
921 rinfo->head->is_target = 0;
922 ininfo = rinfo->targeti.in;
923 vino.ino = le64_to_cpu(ininfo->ino);
924 vino.snap = le64_to_cpu(ininfo->snapid);
925 in = ceph_get_inode(sb, vino);
926 err = fill_inode(in, &rinfo->targeti, NULL,
927 session, req->r_request_started,
928 req->r_fmode);
929 iput(in);
930 }
931 }
932#endif
933
934 if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
935 dout("fill_trace reply is empty!\n");
936 if (rinfo->head->result == 0 && req->r_locked_dir) {
937 struct ceph_inode_info *ci =
938 ceph_inode(req->r_locked_dir);
939 dout(" clearing %p complete (empty trace)\n",
940 req->r_locked_dir);
941 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
942 ci->i_release_count++;
943 }
944 return 0;
945 }
946
947 if (rinfo->head->is_dentry) {
948 struct inode *dir = req->r_locked_dir;
949
950 err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
951 session, req->r_request_started, -1,
952 &req->r_caps_reservation);
953 if (err < 0)
954 return err;
955 }
956
957 /*
958 * ignore null lease/binding on snapdir ENOENT, or else we
959 * will have trouble splicing in the virtual snapdir later
960 */
961 if (rinfo->head->is_dentry && !req->r_aborted &&
962 (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
963 client->mount_args->snapdir_name,
964 req->r_dentry->d_name.len))) {
965 /*
966 * lookup link rename : null -> possibly existing inode
967 * mknod symlink mkdir : null -> new inode
968 * unlink : linked -> null
969 */
970 struct inode *dir = req->r_locked_dir;
971 struct dentry *dn = req->r_dentry;
972 bool have_dir_cap, have_lease;
973
974 BUG_ON(!dn);
975 BUG_ON(!dir);
976 BUG_ON(dn->d_parent->d_inode != dir);
977 BUG_ON(ceph_ino(dir) !=
978 le64_to_cpu(rinfo->diri.in->ino));
979 BUG_ON(ceph_snap(dir) !=
980 le64_to_cpu(rinfo->diri.in->snapid));
981
982 /* do we have a lease on the whole dir? */
983 have_dir_cap =
984 (le32_to_cpu(rinfo->diri.in->cap.caps) &
985 CEPH_CAP_FILE_SHARED);
986
987 /* do we have a dn lease? */
988 have_lease = have_dir_cap ||
989 (le16_to_cpu(rinfo->dlease->mask) &
990 CEPH_LOCK_DN);
991
992 if (!have_lease)
993 dout("fill_trace no dentry lease or dir cap\n");
994
995 /* rename? */
996 if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
997 dout(" src %p '%.*s' dst %p '%.*s'\n",
998 req->r_old_dentry,
999 req->r_old_dentry->d_name.len,
1000 req->r_old_dentry->d_name.name,
1001 dn, dn->d_name.len, dn->d_name.name);
1002 dout("fill_trace doing d_move %p -> %p\n",
1003 req->r_old_dentry, dn);
1004
1005 /* d_move screws up d_subdirs order */
1006 ceph_i_clear(dir, CEPH_I_COMPLETE);
1007
1008 d_move(req->r_old_dentry, dn);
1009 dout(" src %p '%.*s' dst %p '%.*s'\n",
1010 req->r_old_dentry,
1011 req->r_old_dentry->d_name.len,
1012 req->r_old_dentry->d_name.name,
1013 dn, dn->d_name.len, dn->d_name.name);
1014 /* ensure target dentry is invalidated, despite
1015 rehashing bug in vfs_rename_dir */
1016 dn->d_time = jiffies;
1017 ceph_dentry(dn)->lease_shared_gen = 0;
1018 /* take overwritten dentry's readdir offset */
1019 ceph_dentry(req->r_old_dentry)->offset =
1020 ceph_dentry(dn)->offset;
1021 dn = req->r_old_dentry; /* use old_dentry */
1022 in = dn->d_inode;
1023 }
1024
1025 /* null dentry? */
1026 if (!rinfo->head->is_target) {
1027 dout("fill_trace null dentry\n");
1028 if (dn->d_inode) {
1029 dout("d_delete %p\n", dn);
1030 d_delete(dn);
1031 } else {
1032 dout("d_instantiate %p NULL\n", dn);
1033 d_instantiate(dn, NULL);
1034 if (have_lease && d_unhashed(dn))
1035 d_rehash(dn);
1036 update_dentry_lease(dn, rinfo->dlease,
1037 session,
1038 req->r_request_started);
1039 }
1040 goto done;
1041 }
1042
1043 /* attach proper inode */
1044 ininfo = rinfo->targeti.in;
1045 vino.ino = le64_to_cpu(ininfo->ino);
1046 vino.snap = le64_to_cpu(ininfo->snapid);
1047 if (!dn->d_inode) {
1048 in = ceph_get_inode(sb, vino);
1049 if (IS_ERR(in)) {
1050 pr_err("fill_trace bad get_inode "
1051 "%llx.%llx\n", vino.ino, vino.snap);
1052 err = PTR_ERR(in);
1053 d_delete(dn);
1054 goto done;
1055 }
1056 dn = splice_dentry(dn, in, &have_lease);
1057 if (IS_ERR(dn)) {
1058 err = PTR_ERR(dn);
1059 goto done;
1060 }
1061 req->r_dentry = dn; /* may have spliced */
1062 ceph_set_dentry_offset(dn);
1063 igrab(in);
1064 } else if (ceph_ino(in) == vino.ino &&
1065 ceph_snap(in) == vino.snap) {
1066 igrab(in);
1067 } else {
1068 dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
1069 dn, in, ceph_ino(in), ceph_snap(in),
1070 vino.ino, vino.snap);
1071 have_lease = false;
1072 in = NULL;
1073 }
1074
1075 if (have_lease)
1076 update_dentry_lease(dn, rinfo->dlease, session,
1077 req->r_request_started);
1078 dout(" final dn %p\n", dn);
1079 i++;
1080 } else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
1081 req->r_op == CEPH_MDS_OP_MKSNAP) {
1082 struct dentry *dn = req->r_dentry;
1083
1084 /* fill out a snapdir LOOKUPSNAP dentry */
1085 BUG_ON(!dn);
1086 BUG_ON(!req->r_locked_dir);
1087 BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR);
1088 ininfo = rinfo->targeti.in;
1089 vino.ino = le64_to_cpu(ininfo->ino);
1090 vino.snap = le64_to_cpu(ininfo->snapid);
1091 in = ceph_get_inode(sb, vino);
1092 if (IS_ERR(in)) {
1093 pr_err("fill_inode get_inode badness %llx.%llx\n",
1094 vino.ino, vino.snap);
1095 err = PTR_ERR(in);
1096 d_delete(dn);
1097 goto done;
1098 }
1099 dout(" linking snapped dir %p to dn %p\n", in, dn);
1100 dn = splice_dentry(dn, in, NULL);
1101 if (IS_ERR(dn)) {
1102 err = PTR_ERR(dn);
1103 goto done;
1104 }
1105 ceph_set_dentry_offset(dn);
1106 req->r_dentry = dn; /* may have spliced */
1107 igrab(in);
1108 rinfo->head->is_dentry = 1; /* fool notrace handlers */
1109 }
1110
1111 if (rinfo->head->is_target) {
1112 vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1113 vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1114
1115 if (in == NULL || ceph_ino(in) != vino.ino ||
1116 ceph_snap(in) != vino.snap) {
1117 in = ceph_get_inode(sb, vino);
1118 if (IS_ERR(in)) {
1119 err = PTR_ERR(in);
1120 goto done;
1121 }
1122 }
1123 req->r_target_inode = in;
1124
1125 err = fill_inode(in,
1126 &rinfo->targeti, NULL,
1127 session, req->r_request_started,
1128 (le32_to_cpu(rinfo->head->result) == 0) ?
1129 req->r_fmode : -1,
1130 &req->r_caps_reservation);
1131 if (err < 0) {
1132 pr_err("fill_inode badness %p %llx.%llx\n",
1133 in, ceph_vinop(in));
1134 goto done;
1135 }
1136 }
1137
1138done:
1139 dout("fill_trace done err=%d\n", err);
1140 return err;
1141}
1142
1143/*
1144 * Prepopulate our cache with readdir results, leases, etc.
1145 */
1146int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1147 struct ceph_mds_session *session)
1148{
1149 struct dentry *parent = req->r_dentry;
1150 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1151 struct qstr dname;
1152 struct dentry *dn;
1153 struct inode *in;
1154 int err = 0, i;
1155 struct inode *snapdir = NULL;
1156 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
1157 u64 frag = le32_to_cpu(rhead->args.readdir.frag);
1158 struct ceph_dentry_info *di;
1159
1160 if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
1161 snapdir = ceph_get_snapdir(parent->d_inode);
1162 parent = d_find_alias(snapdir);
1163 dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
1164 rinfo->dir_nr, parent);
1165 } else {
1166 dout("readdir_prepopulate %d items under dn %p\n",
1167 rinfo->dir_nr, parent);
1168 if (rinfo->dir_dir)
1169 ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir);
1170 }
1171
1172 for (i = 0; i < rinfo->dir_nr; i++) {
1173 struct ceph_vino vino;
1174
1175 dname.name = rinfo->dir_dname[i];
1176 dname.len = rinfo->dir_dname_len[i];
1177 dname.hash = full_name_hash(dname.name, dname.len);
1178
1179 vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
1180 vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
1181
1182retry_lookup:
1183 dn = d_lookup(parent, &dname);
1184 dout("d_lookup on parent=%p name=%.*s got %p\n",
1185 parent, dname.len, dname.name, dn);
1186
1187 if (!dn) {
1188 dn = d_alloc(parent, &dname);
1189 dout("d_alloc %p '%.*s' = %p\n", parent,
1190 dname.len, dname.name, dn);
1191 if (dn == NULL) {
1192 dout("d_alloc badness\n");
1193 err = -ENOMEM;
1194 goto out;
1195 }
1196 err = ceph_init_dentry(dn);
1197 if (err < 0)
1198 goto out;
1199 } else if (dn->d_inode &&
1200 (ceph_ino(dn->d_inode) != vino.ino ||
1201 ceph_snap(dn->d_inode) != vino.snap)) {
1202 dout(" dn %p points to wrong inode %p\n",
1203 dn, dn->d_inode);
1204 d_delete(dn);
1205 dput(dn);
1206 goto retry_lookup;
1207 } else {
1208 /* reorder parent's d_subdirs */
1209 spin_lock(&dcache_lock);
1210 spin_lock(&dn->d_lock);
1211 list_move(&dn->d_u.d_child, &parent->d_subdirs);
1212 spin_unlock(&dn->d_lock);
1213 spin_unlock(&dcache_lock);
1214 }
1215
1216 di = dn->d_fsdata;
1217 di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
1218
1219 /* inode */
1220 if (dn->d_inode) {
1221 in = dn->d_inode;
1222 } else {
1223 in = ceph_get_inode(parent->d_sb, vino);
1224 if (in == NULL) {
1225 dout("new_inode badness\n");
1226 d_delete(dn);
1227 dput(dn);
1228 err = -ENOMEM;
1229 goto out;
1230 }
1231 dn = splice_dentry(dn, in, NULL);
1232 }
1233
1234 if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
1235 req->r_request_started, -1,
1236 &req->r_caps_reservation) < 0) {
1237 pr_err("fill_inode badness on %p\n", in);
1238 dput(dn);
1239 continue;
1240 }
1241 update_dentry_lease(dn, rinfo->dir_dlease[i],
1242 req->r_session, req->r_request_started);
1243 dput(dn);
1244 }
1245 req->r_did_prepopulate = true;
1246
1247out:
1248 if (snapdir) {
1249 iput(snapdir);
1250 dput(parent);
1251 }
1252 dout("readdir_prepopulate done\n");
1253 return err;
1254}
1255
1256int ceph_inode_set_size(struct inode *inode, loff_t size)
1257{
1258 struct ceph_inode_info *ci = ceph_inode(inode);
1259 int ret = 0;
1260
1261 spin_lock(&inode->i_lock);
1262 dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
1263 inode->i_size = size;
1264 inode->i_blocks = (size + (1 << 9) - 1) >> 9;
1265
1266 /* tell the MDS if we are approaching max_size */
1267 if ((size << 1) >= ci->i_max_size &&
1268 (ci->i_reported_size << 1) < ci->i_max_size)
1269 ret = 1;
1270
1271 spin_unlock(&inode->i_lock);
1272 return ret;
1273}
1274
1275/*
1276 * Write back inode data in a worker thread. (This can't be done
1277 * in the message handler context.)
1278 */
1279void ceph_queue_writeback(struct inode *inode)
1280{
1281 if (queue_work(ceph_inode_to_client(inode)->wb_wq,
1282 &ceph_inode(inode)->i_wb_work)) {
1283 dout("ceph_queue_writeback %p\n", inode);
1284 igrab(inode);
1285 } else {
1286 dout("ceph_queue_writeback %p failed\n", inode);
1287 }
1288}
1289
1290static void ceph_writeback_work(struct work_struct *work)
1291{
1292 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1293 i_wb_work);
1294 struct inode *inode = &ci->vfs_inode;
1295
1296 dout("writeback %p\n", inode);
1297 filemap_fdatawrite(&inode->i_data);
1298 iput(inode);
1299}
1300
1301/*
1302 * queue an async invalidation
1303 */
1304void ceph_queue_invalidate(struct inode *inode)
1305{
1306 if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
1307 &ceph_inode(inode)->i_pg_inv_work)) {
1308 dout("ceph_queue_invalidate %p\n", inode);
1309 igrab(inode);
1310 } else {
1311 dout("ceph_queue_invalidate %p failed\n", inode);
1312 }
1313}
1314
1315/*
1316 * invalidate any pages that are not dirty or under writeback. this
1317 * includes pages that are clean and mapped.
1318 */
1319static void ceph_invalidate_nondirty_pages(struct address_space *mapping)
1320{
1321 struct pagevec pvec;
1322 pgoff_t next = 0;
1323 int i;
1324
1325 pagevec_init(&pvec, 0);
1326 while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
1327 for (i = 0; i < pagevec_count(&pvec); i++) {
1328 struct page *page = pvec.pages[i];
1329 pgoff_t index;
1330 int skip_page =
1331 (PageDirty(page) || PageWriteback(page));
1332
1333 if (!skip_page)
1334 skip_page = !trylock_page(page);
1335
1336 /*
1337 * We really shouldn't be looking at the ->index of an
1338 * unlocked page. But we're not allowed to lock these
1339 * pages. So we rely upon nobody altering the ->index
1340 * of this (pinned-by-us) page.
1341 */
1342 index = page->index;
1343 if (index > next)
1344 next = index;
1345 next++;
1346
1347 if (skip_page)
1348 continue;
1349
1350 generic_error_remove_page(mapping, page);
1351 unlock_page(page);
1352 }
1353 pagevec_release(&pvec);
1354 cond_resched();
1355 }
1356}
1357
1358/*
1359 * Invalidate inode pages in a worker thread. (This can't be done
1360 * in the message handler context.)
1361 */
1362static void ceph_invalidate_work(struct work_struct *work)
1363{
1364 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1365 i_pg_inv_work);
1366 struct inode *inode = &ci->vfs_inode;
1367 u32 orig_gen;
1368 int check = 0;
1369
1370 spin_lock(&inode->i_lock);
1371 dout("invalidate_pages %p gen %d revoking %d\n", inode,
1372 ci->i_rdcache_gen, ci->i_rdcache_revoking);
1373 if (ci->i_rdcache_gen == 0 ||
1374 ci->i_rdcache_revoking != ci->i_rdcache_gen) {
1375 BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
1376 /* nevermind! */
1377 ci->i_rdcache_revoking = 0;
1378 spin_unlock(&inode->i_lock);
1379 goto out;
1380 }
1381 orig_gen = ci->i_rdcache_gen;
1382 spin_unlock(&inode->i_lock);
1383
1384 ceph_invalidate_nondirty_pages(inode->i_mapping);
1385
1386 spin_lock(&inode->i_lock);
1387 if (orig_gen == ci->i_rdcache_gen) {
1388 dout("invalidate_pages %p gen %d successful\n", inode,
1389 ci->i_rdcache_gen);
1390 ci->i_rdcache_gen = 0;
1391 ci->i_rdcache_revoking = 0;
1392 check = 1;
1393 } else {
1394 dout("invalidate_pages %p gen %d raced, gen now %d\n",
1395 inode, orig_gen, ci->i_rdcache_gen);
1396 }
1397 spin_unlock(&inode->i_lock);
1398
1399 if (check)
1400 ceph_check_caps(ci, 0, NULL);
1401out:
1402 iput(inode);
1403}
1404
1405
1406/*
1407 * called by trunc_wq; take i_mutex ourselves
1408 *
1409 * We truncate in a separate thread as well.
1410 */
1411static void ceph_vmtruncate_work(struct work_struct *work)
1412{
1413 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1414 i_vmtruncate_work);
1415 struct inode *inode = &ci->vfs_inode;
1416
1417 dout("vmtruncate_work %p\n", inode);
1418 mutex_lock(&inode->i_mutex);
1419 __ceph_do_pending_vmtruncate(inode);
1420 mutex_unlock(&inode->i_mutex);
1421 iput(inode);
1422}
1423
1424/*
1425 * Queue an async vmtruncate. If we fail to queue work, we will handle
1426 * the truncation the next time we call __ceph_do_pending_vmtruncate.
1427 */
1428void ceph_queue_vmtruncate(struct inode *inode)
1429{
1430 struct ceph_inode_info *ci = ceph_inode(inode);
1431
1432 if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
1433 &ci->i_vmtruncate_work)) {
1434 dout("ceph_queue_vmtruncate %p\n", inode);
1435 igrab(inode);
1436 } else {
1437 dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
1438 inode, ci->i_truncate_pending);
1439 }
1440}
1441
1442/*
1443 * called with i_mutex held.
1444 *
1445 * Make sure any pending truncation is applied before doing anything
1446 * that may depend on it.
1447 */
1448void __ceph_do_pending_vmtruncate(struct inode *inode)
1449{
1450 struct ceph_inode_info *ci = ceph_inode(inode);
1451 u64 to;
1452 int wrbuffer_refs, wake = 0;
1453
1454retry:
1455 spin_lock(&inode->i_lock);
1456 if (ci->i_truncate_pending == 0) {
1457 dout("__do_pending_vmtruncate %p none pending\n", inode);
1458 spin_unlock(&inode->i_lock);
1459 return;
1460 }
1461
1462 /*
1463 * make sure any dirty snapped pages are flushed before we
1464 * possibly truncate them.. so write AND block!
1465 */
1466 if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
1467 dout("__do_pending_vmtruncate %p flushing snaps first\n",
1468 inode);
1469 spin_unlock(&inode->i_lock);
1470 filemap_write_and_wait_range(&inode->i_data, 0,
1471 inode->i_sb->s_maxbytes);
1472 goto retry;
1473 }
1474
1475 to = ci->i_truncate_size;
1476 wrbuffer_refs = ci->i_wrbuffer_ref;
1477 dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
1478 ci->i_truncate_pending, to);
1479 spin_unlock(&inode->i_lock);
1480
1481 truncate_inode_pages(inode->i_mapping, to);
1482
1483 spin_lock(&inode->i_lock);
1484 ci->i_truncate_pending--;
1485 if (ci->i_truncate_pending == 0)
1486 wake = 1;
1487 spin_unlock(&inode->i_lock);
1488
1489 if (wrbuffer_refs == 0)
1490 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
1491 if (wake)
1492 wake_up(&ci->i_cap_wq);
1493}
1494
1495
1496/*
1497 * symlinks
1498 */
1499static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)
1500{
1501 struct ceph_inode_info *ci = ceph_inode(dentry->d_inode);
1502 nd_set_link(nd, ci->i_symlink);
1503 return NULL;
1504}
1505
1506static const struct inode_operations ceph_symlink_iops = {
1507 .readlink = generic_readlink,
1508 .follow_link = ceph_sym_follow_link,
1509};
1510
1511/*
1512 * setattr
1513 */
1514int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1515{
1516 struct inode *inode = dentry->d_inode;
1517 struct ceph_inode_info *ci = ceph_inode(inode);
1518 struct inode *parent_inode = dentry->d_parent->d_inode;
1519 const unsigned int ia_valid = attr->ia_valid;
1520 struct ceph_mds_request *req;
1521 struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc;
1522 int issued;
1523 int release = 0, dirtied = 0;
1524 int mask = 0;
1525 int err = 0;
1526
1527 if (ceph_snap(inode) != CEPH_NOSNAP)
1528 return -EROFS;
1529
1530 __ceph_do_pending_vmtruncate(inode);
1531
1532 err = inode_change_ok(inode, attr);
1533 if (err != 0)
1534 return err;
1535
1536 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
1537 USE_AUTH_MDS);
1538 if (IS_ERR(req))
1539 return PTR_ERR(req);
1540
1541 spin_lock(&inode->i_lock);
1542 issued = __ceph_caps_issued(ci, NULL);
1543 dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
1544
1545 if (ia_valid & ATTR_UID) {
1546 dout("setattr %p uid %d -> %d\n", inode,
1547 inode->i_uid, attr->ia_uid);
1548 if (issued & CEPH_CAP_AUTH_EXCL) {
1549 inode->i_uid = attr->ia_uid;
1550 dirtied |= CEPH_CAP_AUTH_EXCL;
1551 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1552 attr->ia_uid != inode->i_uid) {
1553 req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid);
1554 mask |= CEPH_SETATTR_UID;
1555 release |= CEPH_CAP_AUTH_SHARED;
1556 }
1557 }
1558 if (ia_valid & ATTR_GID) {
1559 dout("setattr %p gid %d -> %d\n", inode,
1560 inode->i_gid, attr->ia_gid);
1561 if (issued & CEPH_CAP_AUTH_EXCL) {
1562 inode->i_gid = attr->ia_gid;
1563 dirtied |= CEPH_CAP_AUTH_EXCL;
1564 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1565 attr->ia_gid != inode->i_gid) {
1566 req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid);
1567 mask |= CEPH_SETATTR_GID;
1568 release |= CEPH_CAP_AUTH_SHARED;
1569 }
1570 }
1571 if (ia_valid & ATTR_MODE) {
1572 dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
1573 attr->ia_mode);
1574 if (issued & CEPH_CAP_AUTH_EXCL) {
1575 inode->i_mode = attr->ia_mode;
1576 dirtied |= CEPH_CAP_AUTH_EXCL;
1577 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1578 attr->ia_mode != inode->i_mode) {
1579 req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
1580 mask |= CEPH_SETATTR_MODE;
1581 release |= CEPH_CAP_AUTH_SHARED;
1582 }
1583 }
1584
1585 if (ia_valid & ATTR_ATIME) {
1586 dout("setattr %p atime %ld.%ld -> %ld.%ld\n", inode,
1587 inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
1588 attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
1589 if (issued & CEPH_CAP_FILE_EXCL) {
1590 ci->i_time_warp_seq++;
1591 inode->i_atime = attr->ia_atime;
1592 dirtied |= CEPH_CAP_FILE_EXCL;
1593 } else if ((issued & CEPH_CAP_FILE_WR) &&
1594 timespec_compare(&inode->i_atime,
1595 &attr->ia_atime) < 0) {
1596 inode->i_atime = attr->ia_atime;
1597 dirtied |= CEPH_CAP_FILE_WR;
1598 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1599 !timespec_equal(&inode->i_atime, &attr->ia_atime)) {
1600 ceph_encode_timespec(&req->r_args.setattr.atime,
1601 &attr->ia_atime);
1602 mask |= CEPH_SETATTR_ATIME;
1603 release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
1604 CEPH_CAP_FILE_WR;
1605 }
1606 }
1607 if (ia_valid & ATTR_MTIME) {
1608 dout("setattr %p mtime %ld.%ld -> %ld.%ld\n", inode,
1609 inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
1610 attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
1611 if (issued & CEPH_CAP_FILE_EXCL) {
1612 ci->i_time_warp_seq++;
1613 inode->i_mtime = attr->ia_mtime;
1614 dirtied |= CEPH_CAP_FILE_EXCL;
1615 } else if ((issued & CEPH_CAP_FILE_WR) &&
1616 timespec_compare(&inode->i_mtime,
1617 &attr->ia_mtime) < 0) {
1618 inode->i_mtime = attr->ia_mtime;
1619 dirtied |= CEPH_CAP_FILE_WR;
1620 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1621 !timespec_equal(&inode->i_mtime, &attr->ia_mtime)) {
1622 ceph_encode_timespec(&req->r_args.setattr.mtime,
1623 &attr->ia_mtime);
1624 mask |= CEPH_SETATTR_MTIME;
1625 release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
1626 CEPH_CAP_FILE_WR;
1627 }
1628 }
1629 if (ia_valid & ATTR_SIZE) {
1630 dout("setattr %p size %lld -> %lld\n", inode,
1631 inode->i_size, attr->ia_size);
1632 if (attr->ia_size > inode->i_sb->s_maxbytes) {
1633 err = -EINVAL;
1634 goto out;
1635 }
1636 if ((issued & CEPH_CAP_FILE_EXCL) &&
1637 attr->ia_size > inode->i_size) {
1638 inode->i_size = attr->ia_size;
1639 inode->i_blocks =
1640 (attr->ia_size + (1 << 9) - 1) >> 9;
1641 inode->i_ctime = attr->ia_ctime;
1642 ci->i_reported_size = attr->ia_size;
1643 dirtied |= CEPH_CAP_FILE_EXCL;
1644 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1645 attr->ia_size != inode->i_size) {
1646 req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
1647 req->r_args.setattr.old_size =
1648 cpu_to_le64(inode->i_size);
1649 mask |= CEPH_SETATTR_SIZE;
1650 release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
1651 CEPH_CAP_FILE_WR;
1652 }
1653 }
1654
1655 /* these do nothing */
1656 if (ia_valid & ATTR_CTIME) {
1657 bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
1658 ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
1659 dout("setattr %p ctime %ld.%ld -> %ld.%ld (%s)\n", inode,
1660 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
1661 attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
1662 only ? "ctime only" : "ignored");
1663 inode->i_ctime = attr->ia_ctime;
1664 if (only) {
1665 /*
1666 * if kernel wants to dirty ctime but nothing else,
1667 * we need to choose a cap to dirty under, or do
1668 * an almost-no-op setattr
1669 */
1670 if (issued & CEPH_CAP_AUTH_EXCL)
1671 dirtied |= CEPH_CAP_AUTH_EXCL;
1672 else if (issued & CEPH_CAP_FILE_EXCL)
1673 dirtied |= CEPH_CAP_FILE_EXCL;
1674 else if (issued & CEPH_CAP_XATTR_EXCL)
1675 dirtied |= CEPH_CAP_XATTR_EXCL;
1676 else
1677 mask |= CEPH_SETATTR_CTIME;
1678 }
1679 }
1680 if (ia_valid & ATTR_FILE)
1681 dout("setattr %p ATTR_FILE ... hrm!\n", inode);
1682
1683 if (dirtied) {
1684 __ceph_mark_dirty_caps(ci, dirtied);
1685 inode->i_ctime = CURRENT_TIME;
1686 }
1687
1688 release &= issued;
1689 spin_unlock(&inode->i_lock);
1690
1691 if (mask) {
1692 req->r_inode = igrab(inode);
1693 req->r_inode_drop = release;
1694 req->r_args.setattr.mask = cpu_to_le32(mask);
1695 req->r_num_caps = 1;
1696 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
1697 }
1698 dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
1699 ceph_cap_string(dirtied), mask);
1700
1701 ceph_mdsc_put_request(req);
1702 __ceph_do_pending_vmtruncate(inode);
1703 return err;
1704out:
1705 spin_unlock(&inode->i_lock);
1706 ceph_mdsc_put_request(req);
1707 return err;
1708}
1709
1710/*
1711 * Verify that we have a lease on the given mask. If not,
1712 * do a getattr against an mds.
1713 */
1714int ceph_do_getattr(struct inode *inode, int mask)
1715{
1716 struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
1717 struct ceph_mds_client *mdsc = &client->mdsc;
1718 struct ceph_mds_request *req;
1719 int err;
1720
1721 if (ceph_snap(inode) == CEPH_SNAPDIR) {
1722 dout("do_getattr inode %p SNAPDIR\n", inode);
1723 return 0;
1724 }
1725
1726 dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask));
1727 if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
1728 return 0;
1729
1730 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
1731 if (IS_ERR(req))
1732 return PTR_ERR(req);
1733 req->r_inode = igrab(inode);
1734 req->r_num_caps = 1;
1735 req->r_args.getattr.mask = cpu_to_le32(mask);
1736 err = ceph_mdsc_do_request(mdsc, NULL, req);
1737 ceph_mdsc_put_request(req);
1738 dout("do_getattr result=%d\n", err);
1739 return err;
1740}
1741
1742
1743/*
1744 * Check inode permissions. We verify we have a valid value for
1745 * the AUTH cap, then call the generic handler.
1746 */
1747int ceph_permission(struct inode *inode, int mask)
1748{
1749 int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
1750
1751 if (!err)
1752 err = generic_permission(inode, mask, NULL);
1753 return err;
1754}
1755
1756/*
 1757 * Get all attributes. Hopefully someday we'll have a statlite()
1758 * and can limit the fields we require to be accurate.
1759 */
1760int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
1761 struct kstat *stat)
1762{
1763 struct inode *inode = dentry->d_inode;
1764 struct ceph_inode_info *ci = ceph_inode(inode);
1765 int err;
1766
1767 err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
1768 if (!err) {
1769 generic_fillattr(inode, stat);
1770 stat->ino = inode->i_ino;
1771 if (ceph_snap(inode) != CEPH_NOSNAP)
1772 stat->dev = ceph_snap(inode);
1773 else
1774 stat->dev = 0;
1775 if (S_ISDIR(inode->i_mode)) {
1776 stat->size = ci->i_rbytes;
1777 stat->blocks = 0;
1778 stat->blksize = 65536;
1779 }
1780 }
1781 return err;
1782}
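[Editor's note, not part of the patch: because ceph_getattr() above reports
ci->i_rbytes as a directory's size, a plain stat(2) from userspace sees the
recursive byte count of everything beneath that directory. A minimal sketch,
assuming a Ceph mount at the hypothetical path /mnt/ceph:

	#include <stdio.h>
	#include <sys/stat.h>

	int main(void)
	{
		struct stat st;

		if (stat("/mnt/ceph/somedir", &st) == 0)	/* hypothetical path */
			printf("recursive bytes: %lld\n", (long long)st.st_size);
		return 0;
	}
]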
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
new file mode 100644
index 000000000000..8a5bcae62846
--- /dev/null
+++ b/fs/ceph/ioctl.c
@@ -0,0 +1,160 @@
1#include <linux/in.h>
2
3#include "ioctl.h"
4#include "super.h"
5#include "ceph_debug.h"
6
7
8/*
9 * ioctls
10 */
11
12/*
13 * get and set the file layout
14 */
15static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
16{
17 struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
18 struct ceph_ioctl_layout l;
19 int err;
20
21 err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
22 if (!err) {
23 l.stripe_unit = ceph_file_layout_su(ci->i_layout);
24 l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
25 l.object_size = ceph_file_layout_object_size(ci->i_layout);
26 l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
27 l.preferred_osd =
28 (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred);
29 if (copy_to_user(arg, &l, sizeof(l)))
30 return -EFAULT;
31 }
32
33 return err;
34}
35
36static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
37{
38 struct inode *inode = file->f_dentry->d_inode;
39 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
40 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
41 struct ceph_mds_request *req;
42 struct ceph_ioctl_layout l;
43 int err, i;
44
45 /* copy and validate */
46 if (copy_from_user(&l, arg, sizeof(l)))
47 return -EFAULT;
48
49 if ((l.object_size & ~PAGE_MASK) ||
50 (l.stripe_unit & ~PAGE_MASK) ||
51 !l.stripe_unit ||
52 (l.object_size &&
53 (unsigned)l.object_size % (unsigned)l.stripe_unit))
54 return -EINVAL;
55
56 /* make sure it's a valid data pool */
57 if (l.data_pool > 0) {
58 mutex_lock(&mdsc->mutex);
59 err = -EINVAL;
60 for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
61 if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
62 err = 0;
63 break;
64 }
65 mutex_unlock(&mdsc->mutex);
66 if (err)
67 return err;
68 }
69
70 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT,
71 USE_AUTH_MDS);
72 if (IS_ERR(req))
73 return PTR_ERR(req);
74 req->r_inode = igrab(inode);
75 req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
76
77 req->r_args.setlayout.layout.fl_stripe_unit =
78 cpu_to_le32(l.stripe_unit);
79 req->r_args.setlayout.layout.fl_stripe_count =
80 cpu_to_le32(l.stripe_count);
81 req->r_args.setlayout.layout.fl_object_size =
82 cpu_to_le32(l.object_size);
83 req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
84 req->r_args.setlayout.layout.fl_pg_preferred =
85 cpu_to_le32(l.preferred_osd);
86
87 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
88 ceph_mdsc_put_request(req);
89 return err;
90}
91
92/*
93 * Return object name, size/offset information, and location (OSD
94 * number, network address) for a given file offset.
95 */
96static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
97{
98 struct ceph_ioctl_dataloc dl;
99 struct inode *inode = file->f_dentry->d_inode;
100 struct ceph_inode_info *ci = ceph_inode(inode);
101 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
102 u64 len = 1, olen;
103 u64 tmp;
104 struct ceph_object_layout ol;
105 struct ceph_pg pgid;
106
107 /* copy and validate */
108 if (copy_from_user(&dl, arg, sizeof(dl)))
109 return -EFAULT;
110
111 down_read(&osdc->map_sem);
112 ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
113 &dl.object_no, &dl.object_offset, &olen);
114 dl.file_offset -= dl.object_offset;
115 dl.object_size = ceph_file_layout_object_size(ci->i_layout);
116 dl.block_size = ceph_file_layout_su(ci->i_layout);
117
118 /* block_offset = object_offset % block_size */
119 tmp = dl.object_offset;
120 dl.block_offset = do_div(tmp, dl.block_size);
121
122 snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
123 ceph_ino(inode), dl.object_no);
124 ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout,
125 osdc->osdmap);
126
127 pgid = ol.ol_pgid;
128 dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
129 if (dl.osd >= 0) {
130 struct ceph_entity_addr *a =
131 ceph_osd_addr(osdc->osdmap, dl.osd);
132 if (a)
133 memcpy(&dl.osd_addr, &a->in_addr, sizeof(dl.osd_addr));
134 } else {
135 memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
136 }
137 up_read(&osdc->map_sem);
138
139 /* send result back to user */
140 if (copy_to_user(arg, &dl, sizeof(dl)))
141 return -EFAULT;
142
143 return 0;
144}
145
146long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
147{
148 dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
149 switch (cmd) {
150 case CEPH_IOC_GET_LAYOUT:
151 return ceph_ioctl_get_layout(file, (void __user *)arg);
152
153 case CEPH_IOC_SET_LAYOUT:
154 return ceph_ioctl_set_layout(file, (void __user *)arg);
155
156 case CEPH_IOC_GET_DATALOC:
157 return ceph_ioctl_get_dataloc(file, (void __user *)arg);
158 }
159 return -ENOTTY;
160}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
new file mode 100644
index 000000000000..25e4f1a9d059
--- /dev/null
+++ b/fs/ceph/ioctl.h
@@ -0,0 +1,40 @@
1#ifndef FS_CEPH_IOCTL_H
2#define FS_CEPH_IOCTL_H
3
4#include <linux/ioctl.h>
5#include <linux/types.h>
6
7#define CEPH_IOCTL_MAGIC 0x97
8
9/* just use u64 to align sanely on all archs */
10struct ceph_ioctl_layout {
11 __u64 stripe_unit, stripe_count, object_size;
12 __u64 data_pool;
13 __s64 preferred_osd;
14};
15
16#define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1, \
17 struct ceph_ioctl_layout)
18#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \
19 struct ceph_ioctl_layout)
20
21/*
22 * Extract identity, address of the OSD and object storing a given
23 * file offset.
24 */
25struct ceph_ioctl_dataloc {
26 __u64 file_offset; /* in+out: file offset */
27 __u64 object_offset; /* out: offset in object */
28 __u64 object_no; /* out: object # */
29 __u64 object_size; /* out: object size */
30 char object_name[64]; /* out: object name */
31 __u64 block_offset; /* out: offset in block */
32 __u64 block_size; /* out: block length */
33 __s64 osd; /* out: osd # */
34 struct sockaddr_storage osd_addr; /* out: osd address */
35};
36
37#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
38 struct ceph_ioctl_dataloc)
39
40#endif
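[Editor's note: a minimal userspace sketch of driving the ioctls defined
above, assuming a file on a Ceph mount at a hypothetical path; error
handling is elided:

	#include <stdio.h>
	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include "ioctl.h"	/* CEPH_IOC_* and the structs above */

	int main(void)
	{
		struct ceph_ioctl_layout l;
		struct ceph_ioctl_dataloc dl = { .file_offset = 0 };
		int fd = open("/mnt/ceph/somefile", O_RDONLY);	/* hypothetical */

		if (ioctl(fd, CEPH_IOC_GET_LAYOUT, &l) == 0)
			printf("stripe_unit %llu object_size %llu\n",
			       (unsigned long long)l.stripe_unit,
			       (unsigned long long)l.object_size);
		if (ioctl(fd, CEPH_IOC_GET_DATALOC, &dl) == 0)
			printf("offset 0 is in %s on osd%lld\n",
			       dl.object_name, (long long)dl.osd);
		return 0;
	}
]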
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
new file mode 100644
index 000000000000..24561a557e01
--- /dev/null
+++ b/fs/ceph/mds_client.c
@@ -0,0 +1,3047 @@
1#include "ceph_debug.h"
2
3#include <linux/wait.h>
4#include <linux/slab.h>
5#include <linux/sched.h>
6
7#include "mds_client.h"
8#include "mon_client.h"
9#include "super.h"
10#include "messenger.h"
11#include "decode.h"
12#include "auth.h"
13#include "pagelist.h"
14
15/*
16 * A cluster of MDS (metadata server) daemons is responsible for
17 * managing the file system namespace (the directory hierarchy and
18 * inodes) and for coordinating shared access to storage. Metadata is
 19 * partitioned hierarchically across a number of servers, and that
20 * partition varies over time as the cluster adjusts the distribution
21 * in order to balance load.
22 *
 23 * The MDS client is primarily responsible for managing synchronous
 24 * metadata requests for operations like open, unlink, and so forth.
 25 * If there is an MDS failure, we find out about it when we (possibly
26 * request and) receive a new MDS map, and can resubmit affected
27 * requests.
28 *
29 * For the most part, though, we take advantage of a lossless
30 * communications channel to the MDS, and do not need to worry about
31 * timing out or resubmitting requests.
32 *
33 * We maintain a stateful "session" with each MDS we interact with.
 34 * Within each session, we send periodic heartbeat messages to ensure
 35 * any capabilities or leases we have been issued remain valid. If
36 * the session times out and goes stale, our leases and capabilities
37 * are no longer valid.
38 */
39
40static void __wake_requests(struct ceph_mds_client *mdsc,
41 struct list_head *head);
42
 43static const struct ceph_connection_operations mds_con_ops;
44
45
46/*
47 * mds reply parsing
48 */
49
50/*
51 * parse individual inode info
52 */
53static int parse_reply_info_in(void **p, void *end,
54 struct ceph_mds_reply_info_in *info)
55{
56 int err = -EIO;
57
58 info->in = *p;
59 *p += sizeof(struct ceph_mds_reply_inode) +
60 sizeof(*info->in->fragtree.splits) *
61 le32_to_cpu(info->in->fragtree.nsplits);
62
63 ceph_decode_32_safe(p, end, info->symlink_len, bad);
64 ceph_decode_need(p, end, info->symlink_len, bad);
65 info->symlink = *p;
66 *p += info->symlink_len;
67
68 ceph_decode_32_safe(p, end, info->xattr_len, bad);
69 ceph_decode_need(p, end, info->xattr_len, bad);
70 info->xattr_data = *p;
71 *p += info->xattr_len;
72 return 0;
73bad:
74 return err;
75}
76
77/*
78 * parse a normal reply, which may contain a (dir+)dentry and/or a
79 * target inode.
80 */
81static int parse_reply_info_trace(void **p, void *end,
82 struct ceph_mds_reply_info_parsed *info)
83{
84 int err;
85
86 if (info->head->is_dentry) {
87 err = parse_reply_info_in(p, end, &info->diri);
88 if (err < 0)
89 goto out_bad;
90
91 if (unlikely(*p + sizeof(*info->dirfrag) > end))
92 goto bad;
93 info->dirfrag = *p;
94 *p += sizeof(*info->dirfrag) +
95 sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
96 if (unlikely(*p > end))
97 goto bad;
98
99 ceph_decode_32_safe(p, end, info->dname_len, bad);
100 ceph_decode_need(p, end, info->dname_len, bad);
101 info->dname = *p;
102 *p += info->dname_len;
103 info->dlease = *p;
104 *p += sizeof(*info->dlease);
105 }
106
107 if (info->head->is_target) {
108 err = parse_reply_info_in(p, end, &info->targeti);
109 if (err < 0)
110 goto out_bad;
111 }
112
113 if (unlikely(*p != end))
114 goto bad;
115 return 0;
116
117bad:
118 err = -EIO;
119out_bad:
120 pr_err("problem parsing mds trace %d\n", err);
121 return err;
122}
123
124/*
125 * parse readdir results
126 */
127static int parse_reply_info_dir(void **p, void *end,
128 struct ceph_mds_reply_info_parsed *info)
129{
130 u32 num, i = 0;
131 int err;
132
133 info->dir_dir = *p;
134 if (*p + sizeof(*info->dir_dir) > end)
135 goto bad;
136 *p += sizeof(*info->dir_dir) +
137 sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
138 if (*p > end)
139 goto bad;
140
141 ceph_decode_need(p, end, sizeof(num) + 2, bad);
142 num = ceph_decode_32(p);
143 info->dir_end = ceph_decode_8(p);
144 info->dir_complete = ceph_decode_8(p);
145 if (num == 0)
146 goto done;
147
148 /* alloc large array */
149 info->dir_nr = num;
150 info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
151 sizeof(*info->dir_dname) +
152 sizeof(*info->dir_dname_len) +
153 sizeof(*info->dir_dlease),
154 GFP_NOFS);
155 if (info->dir_in == NULL) {
156 err = -ENOMEM;
157 goto out_bad;
158 }
159 info->dir_dname = (void *)(info->dir_in + num);
160 info->dir_dname_len = (void *)(info->dir_dname + num);
161 info->dir_dlease = (void *)(info->dir_dname_len + num);
162
163 while (num) {
164 /* dentry */
165 ceph_decode_need(p, end, sizeof(u32)*2, bad);
166 info->dir_dname_len[i] = ceph_decode_32(p);
167 ceph_decode_need(p, end, info->dir_dname_len[i], bad);
168 info->dir_dname[i] = *p;
169 *p += info->dir_dname_len[i];
170 dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
171 info->dir_dname[i]);
172 info->dir_dlease[i] = *p;
173 *p += sizeof(struct ceph_mds_reply_lease);
174
175 /* inode */
176 err = parse_reply_info_in(p, end, &info->dir_in[i]);
177 if (err < 0)
178 goto out_bad;
179 i++;
180 num--;
181 }
182
183done:
184 if (*p != end)
185 goto bad;
186 return 0;
187
188bad:
189 err = -EIO;
190out_bad:
191 pr_err("problem parsing dir contents %d\n", err);
192 return err;
193}
194
195/*
196 * parse entire mds reply
197 */
198static int parse_reply_info(struct ceph_msg *msg,
199 struct ceph_mds_reply_info_parsed *info)
200{
201 void *p, *end;
202 u32 len;
203 int err;
204
205 info->head = msg->front.iov_base;
206 p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
207 end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
208
209 /* trace */
210 ceph_decode_32_safe(&p, end, len, bad);
211 if (len > 0) {
212 err = parse_reply_info_trace(&p, p+len, info);
213 if (err < 0)
214 goto out_bad;
215 }
216
217 /* dir content */
218 ceph_decode_32_safe(&p, end, len, bad);
219 if (len > 0) {
220 err = parse_reply_info_dir(&p, p+len, info);
221 if (err < 0)
222 goto out_bad;
223 }
224
225 /* snap blob */
226 ceph_decode_32_safe(&p, end, len, bad);
227 info->snapblob_len = len;
228 info->snapblob = p;
229 p += len;
230
231 if (p != end)
232 goto bad;
233 return 0;
234
235bad:
236 err = -EIO;
237out_bad:
238 pr_err("mds parse_reply err %d\n", err);
239 return err;
240}
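/*
 * (Editor's note, not in the patch.)  The reply front parsed above is
 * laid out as: struct ceph_mds_reply_head, then three length-prefixed
 * sections in order: the trace (dentry + target inode), the readdir
 * contents, and the snap realm blob.
 */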
241
242static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
243{
244 kfree(info->dir_in);
245}
246
247
248/*
249 * sessions
250 */
251static const char *session_state_name(int s)
252{
253 switch (s) {
254 case CEPH_MDS_SESSION_NEW: return "new";
255 case CEPH_MDS_SESSION_OPENING: return "opening";
256 case CEPH_MDS_SESSION_OPEN: return "open";
257 case CEPH_MDS_SESSION_HUNG: return "hung";
258 case CEPH_MDS_SESSION_CLOSING: return "closing";
259 case CEPH_MDS_SESSION_RESTARTING: return "restarting";
260 case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
261 default: return "???";
262 }
263}
264
265static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
266{
267 if (atomic_inc_not_zero(&s->s_ref)) {
268 dout("mdsc get_session %p %d -> %d\n", s,
269 atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref));
270 return s;
271 } else {
272 dout("mdsc get_session %p 0 -- FAIL", s);
273 return NULL;
274 }
275}
276
277void ceph_put_mds_session(struct ceph_mds_session *s)
278{
279 dout("mdsc put_session %p %d -> %d\n", s,
280 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
281 if (atomic_dec_and_test(&s->s_ref)) {
282 if (s->s_authorizer)
283 s->s_mdsc->client->monc.auth->ops->destroy_authorizer(
284 s->s_mdsc->client->monc.auth, s->s_authorizer);
285 kfree(s);
286 }
287}
288
289/*
290 * called under mdsc->mutex
291 */
292struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
293 int mds)
294{
295 struct ceph_mds_session *session;
296
297 if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL)
298 return NULL;
299 session = mdsc->sessions[mds];
300 dout("lookup_mds_session %p %d\n", session,
301 atomic_read(&session->s_ref));
302 get_session(session);
303 return session;
304}
305
306static bool __have_session(struct ceph_mds_client *mdsc, int mds)
307{
308 if (mds >= mdsc->max_sessions)
309 return false;
310 return mdsc->sessions[mds];
311}
312
313static int __verify_registered_session(struct ceph_mds_client *mdsc,
314 struct ceph_mds_session *s)
315{
316 if (s->s_mds >= mdsc->max_sessions ||
317 mdsc->sessions[s->s_mds] != s)
318 return -ENOENT;
319 return 0;
320}
321
322/*
323 * create+register a new session for given mds.
324 * called under mdsc->mutex.
325 */
326static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
327 int mds)
328{
329 struct ceph_mds_session *s;
330
331 s = kzalloc(sizeof(*s), GFP_NOFS);
332 if (!s)
333 return ERR_PTR(-ENOMEM);
334 s->s_mdsc = mdsc;
335 s->s_mds = mds;
336 s->s_state = CEPH_MDS_SESSION_NEW;
337 s->s_ttl = 0;
338 s->s_seq = 0;
339 mutex_init(&s->s_mutex);
340
341 ceph_con_init(mdsc->client->msgr, &s->s_con);
342 s->s_con.private = s;
343 s->s_con.ops = &mds_con_ops;
344 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
345 s->s_con.peer_name.num = cpu_to_le64(mds);
346
347 spin_lock_init(&s->s_cap_lock);
348 s->s_cap_gen = 0;
349 s->s_cap_ttl = 0;
350 s->s_renew_requested = 0;
351 s->s_renew_seq = 0;
352 INIT_LIST_HEAD(&s->s_caps);
353 s->s_nr_caps = 0;
354 s->s_trim_caps = 0;
355 atomic_set(&s->s_ref, 1);
356 INIT_LIST_HEAD(&s->s_waiting);
357 INIT_LIST_HEAD(&s->s_unsafe);
358 s->s_num_cap_releases = 0;
359 s->s_cap_iterator = NULL;
360 INIT_LIST_HEAD(&s->s_cap_releases);
361 INIT_LIST_HEAD(&s->s_cap_releases_done);
362 INIT_LIST_HEAD(&s->s_cap_flushing);
363 INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
364
365 dout("register_session mds%d\n", mds);
366 if (mds >= mdsc->max_sessions) {
367 int newmax = 1 << get_count_order(mds+1);
368 struct ceph_mds_session **sa;
369
370 dout("register_session realloc to %d\n", newmax);
371 sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
372 if (sa == NULL)
373 goto fail_realloc;
374 if (mdsc->sessions) {
375 memcpy(sa, mdsc->sessions,
376 mdsc->max_sessions * sizeof(void *));
377 kfree(mdsc->sessions);
378 }
379 mdsc->sessions = sa;
380 mdsc->max_sessions = newmax;
381 }
382 mdsc->sessions[mds] = s;
383 atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */
384
385 ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
386
387 return s;
388
389fail_realloc:
390 kfree(s);
391 return ERR_PTR(-ENOMEM);
392}
393
394/*
395 * called under mdsc->mutex
396 */
397static void __unregister_session(struct ceph_mds_client *mdsc,
398 struct ceph_mds_session *s)
399{
400 dout("__unregister_session mds%d %p\n", s->s_mds, s);
401 BUG_ON(mdsc->sessions[s->s_mds] != s);
402 mdsc->sessions[s->s_mds] = NULL;
403 ceph_con_close(&s->s_con);
404 ceph_put_mds_session(s);
405}
406
407/*
408 * drop session refs in request.
409 *
410 * should be last request ref, or hold mdsc->mutex
411 */
412static void put_request_session(struct ceph_mds_request *req)
413{
414 if (req->r_session) {
415 ceph_put_mds_session(req->r_session);
416 req->r_session = NULL;
417 }
418}
419
420void ceph_mdsc_release_request(struct kref *kref)
421{
422 struct ceph_mds_request *req = container_of(kref,
423 struct ceph_mds_request,
424 r_kref);
425 if (req->r_request)
426 ceph_msg_put(req->r_request);
427 if (req->r_reply) {
428 ceph_msg_put(req->r_reply);
429 destroy_reply_info(&req->r_reply_info);
430 }
431 if (req->r_inode) {
432 ceph_put_cap_refs(ceph_inode(req->r_inode),
433 CEPH_CAP_PIN);
434 iput(req->r_inode);
435 }
436 if (req->r_locked_dir)
437 ceph_put_cap_refs(ceph_inode(req->r_locked_dir),
438 CEPH_CAP_PIN);
439 if (req->r_target_inode)
440 iput(req->r_target_inode);
441 if (req->r_dentry)
442 dput(req->r_dentry);
443 if (req->r_old_dentry) {
444 ceph_put_cap_refs(
445 ceph_inode(req->r_old_dentry->d_parent->d_inode),
446 CEPH_CAP_PIN);
447 dput(req->r_old_dentry);
448 }
449 kfree(req->r_path1);
450 kfree(req->r_path2);
451 put_request_session(req);
452 ceph_unreserve_caps(&req->r_caps_reservation);
453 kfree(req);
454}
455
456/*
 457 * lookup request, bump ref if found.
458 *
459 * called under mdsc->mutex.
460 */
461static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
462 u64 tid)
463{
464 struct ceph_mds_request *req;
465 struct rb_node *n = mdsc->request_tree.rb_node;
466
467 while (n) {
468 req = rb_entry(n, struct ceph_mds_request, r_node);
469 if (tid < req->r_tid)
470 n = n->rb_left;
471 else if (tid > req->r_tid)
472 n = n->rb_right;
473 else {
474 ceph_mdsc_get_request(req);
475 return req;
476 }
477 }
478 return NULL;
479}
480
481static void __insert_request(struct ceph_mds_client *mdsc,
482 struct ceph_mds_request *new)
483{
484 struct rb_node **p = &mdsc->request_tree.rb_node;
485 struct rb_node *parent = NULL;
486 struct ceph_mds_request *req = NULL;
487
488 while (*p) {
489 parent = *p;
490 req = rb_entry(parent, struct ceph_mds_request, r_node);
491 if (new->r_tid < req->r_tid)
492 p = &(*p)->rb_left;
493 else if (new->r_tid > req->r_tid)
494 p = &(*p)->rb_right;
495 else
496 BUG();
497 }
498
499 rb_link_node(&new->r_node, parent, p);
500 rb_insert_color(&new->r_node, &mdsc->request_tree);
501}
502
503/*
 504 * Register an in-flight request, and assign a tid. Link to the
 505 * directory we are modifying (if any).
506 *
507 * Called under mdsc->mutex.
508 */
509static void __register_request(struct ceph_mds_client *mdsc,
510 struct ceph_mds_request *req,
511 struct inode *dir)
512{
513 req->r_tid = ++mdsc->last_tid;
514 if (req->r_num_caps)
515 ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps);
516 dout("__register_request %p tid %lld\n", req, req->r_tid);
517 ceph_mdsc_get_request(req);
518 __insert_request(mdsc, req);
519
520 if (dir) {
521 struct ceph_inode_info *ci = ceph_inode(dir);
522
523 spin_lock(&ci->i_unsafe_lock);
524 req->r_unsafe_dir = dir;
525 list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
526 spin_unlock(&ci->i_unsafe_lock);
527 }
528}
529
530static void __unregister_request(struct ceph_mds_client *mdsc,
531 struct ceph_mds_request *req)
532{
533 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
534 rb_erase(&req->r_node, &mdsc->request_tree);
535 RB_CLEAR_NODE(&req->r_node);
536
537 if (req->r_unsafe_dir) {
538 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
539
540 spin_lock(&ci->i_unsafe_lock);
541 list_del_init(&req->r_unsafe_dir_item);
542 spin_unlock(&ci->i_unsafe_lock);
543 }
544
545 ceph_mdsc_put_request(req);
546}
547
548/*
549 * Choose mds to send request to next. If there is a hint set in the
550 * request (e.g., due to a prior forward hint from the mds), use that.
551 * Otherwise, consult frag tree and/or caps to identify the
552 * appropriate mds. If all else fails, choose randomly.
553 *
554 * Called under mdsc->mutex.
555 */
556static int __choose_mds(struct ceph_mds_client *mdsc,
557 struct ceph_mds_request *req)
558{
559 struct inode *inode;
560 struct ceph_inode_info *ci;
561 struct ceph_cap *cap;
562 int mode = req->r_direct_mode;
563 int mds = -1;
564 u32 hash = req->r_direct_hash;
565 bool is_hash = req->r_direct_is_hash;
566
567 /*
568 * is there a specific mds we should try? ignore hint if we have
569 * no session and the mds is not up (active or recovering).
570 */
571 if (req->r_resend_mds >= 0 &&
572 (__have_session(mdsc, req->r_resend_mds) ||
573 ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
574 dout("choose_mds using resend_mds mds%d\n",
575 req->r_resend_mds);
576 return req->r_resend_mds;
577 }
578
579 if (mode == USE_RANDOM_MDS)
580 goto random;
581
582 inode = NULL;
583 if (req->r_inode) {
584 inode = req->r_inode;
585 } else if (req->r_dentry) {
586 if (req->r_dentry->d_inode) {
587 inode = req->r_dentry->d_inode;
588 } else {
589 inode = req->r_dentry->d_parent->d_inode;
590 hash = req->r_dentry->d_name.hash;
591 is_hash = true;
592 }
593 }
594 dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
595 (int)hash, mode);
596 if (!inode)
597 goto random;
598 ci = ceph_inode(inode);
599
600 if (is_hash && S_ISDIR(inode->i_mode)) {
601 struct ceph_inode_frag frag;
602 int found;
603
604 ceph_choose_frag(ci, hash, &frag, &found);
605 if (found) {
606 if (mode == USE_ANY_MDS && frag.ndist > 0) {
607 u8 r;
608
609 /* choose a random replica */
610 get_random_bytes(&r, 1);
611 r %= frag.ndist;
612 mds = frag.dist[r];
613 dout("choose_mds %p %llx.%llx "
614 "frag %u mds%d (%d/%d)\n",
615 inode, ceph_vinop(inode),
616 frag.frag, frag.mds,
617 (int)r, frag.ndist);
618 return mds;
619 }
620
621 /* since this file/dir wasn't known to be
 622 * replicated, we want to look for the
623 * authoritative mds. */
624 mode = USE_AUTH_MDS;
625 if (frag.mds >= 0) {
626 /* choose auth mds */
627 mds = frag.mds;
628 dout("choose_mds %p %llx.%llx "
629 "frag %u mds%d (auth)\n",
630 inode, ceph_vinop(inode), frag.frag, mds);
631 return mds;
632 }
633 }
634 }
635
636 spin_lock(&inode->i_lock);
637 cap = NULL;
638 if (mode == USE_AUTH_MDS)
639 cap = ci->i_auth_cap;
640 if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
641 cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
642 if (!cap) {
643 spin_unlock(&inode->i_lock);
644 goto random;
645 }
646 mds = cap->session->s_mds;
647 dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
648 inode, ceph_vinop(inode), mds,
649 cap == ci->i_auth_cap ? "auth " : "", cap);
650 spin_unlock(&inode->i_lock);
651 return mds;
652
653random:
654 mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
655 dout("choose_mds chose random mds%d\n", mds);
656 return mds;
657}
658
659
660/*
661 * session messages
662 */
663static struct ceph_msg *create_session_msg(u32 op, u64 seq)
664{
665 struct ceph_msg *msg;
666 struct ceph_mds_session_head *h;
667
668 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL);
669 if (IS_ERR(msg)) {
670 pr_err("create_session_msg ENOMEM creating msg\n");
671 return ERR_PTR(PTR_ERR(msg));
672 }
673 h = msg->front.iov_base;
674 h->op = cpu_to_le32(op);
675 h->seq = cpu_to_le64(seq);
676 return msg;
677}
678
679/*
680 * send session open request.
681 *
682 * called under mdsc->mutex
683 */
684static int __open_session(struct ceph_mds_client *mdsc,
685 struct ceph_mds_session *session)
686{
687 struct ceph_msg *msg;
688 int mstate;
689 int mds = session->s_mds;
690 int err = 0;
691
692 /* wait for mds to go active? */
693 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
694 dout("open_session to mds%d (%s)\n", mds,
695 ceph_mds_state_name(mstate));
696 session->s_state = CEPH_MDS_SESSION_OPENING;
697 session->s_renew_requested = jiffies;
698
699 /* send connect message */
700 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
701 if (IS_ERR(msg)) {
702 err = PTR_ERR(msg);
703 goto out;
704 }
705 ceph_con_send(&session->s_con, msg);
706
707out:
 708 return err;
709}
710
711/*
712 * session caps
713 */
714
715/*
716 * Free preallocated cap messages assigned to this session
717 */
718static void cleanup_cap_releases(struct ceph_mds_session *session)
719{
720 struct ceph_msg *msg;
721
722 spin_lock(&session->s_cap_lock);
723 while (!list_empty(&session->s_cap_releases)) {
724 msg = list_first_entry(&session->s_cap_releases,
725 struct ceph_msg, list_head);
726 list_del_init(&msg->list_head);
727 ceph_msg_put(msg);
728 }
729 while (!list_empty(&session->s_cap_releases_done)) {
730 msg = list_first_entry(&session->s_cap_releases_done,
731 struct ceph_msg, list_head);
732 list_del_init(&msg->list_head);
733 ceph_msg_put(msg);
734 }
735 spin_unlock(&session->s_cap_lock);
736}
737
738/*
739 * Helper to safely iterate over all caps associated with a session, with
740 * special care taken to handle a racing __ceph_remove_cap().
741 *
742 * Caller must hold session s_mutex.
743 */
744static int iterate_session_caps(struct ceph_mds_session *session,
745 int (*cb)(struct inode *, struct ceph_cap *,
746 void *), void *arg)
747{
748 struct list_head *p;
749 struct ceph_cap *cap;
750 struct inode *inode, *last_inode = NULL;
751 struct ceph_cap *old_cap = NULL;
752 int ret;
753
754 dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
755 spin_lock(&session->s_cap_lock);
756 p = session->s_caps.next;
757 while (p != &session->s_caps) {
758 cap = list_entry(p, struct ceph_cap, session_caps);
759 inode = igrab(&cap->ci->vfs_inode);
760 if (!inode) {
761 p = p->next;
762 continue;
763 }
764 session->s_cap_iterator = cap;
765 spin_unlock(&session->s_cap_lock);
766
767 if (last_inode) {
768 iput(last_inode);
769 last_inode = NULL;
770 }
771 if (old_cap) {
772 ceph_put_cap(old_cap);
773 old_cap = NULL;
774 }
775
776 ret = cb(inode, cap, arg);
777 last_inode = inode;
778
779 spin_lock(&session->s_cap_lock);
780 p = p->next;
781 if (cap->ci == NULL) {
782 dout("iterate_session_caps finishing cap %p removal\n",
783 cap);
784 BUG_ON(cap->session != session);
785 list_del_init(&cap->session_caps);
786 session->s_nr_caps--;
787 cap->session = NULL;
788 old_cap = cap; /* put_cap it w/o locks held */
789 }
790 if (ret < 0)
791 goto out;
792 }
793 ret = 0;
794out:
795 session->s_cap_iterator = NULL;
796 spin_unlock(&session->s_cap_lock);
797
798 if (last_inode)
799 iput(last_inode);
800 if (old_cap)
801 ceph_put_cap(old_cap);
802
803 return ret;
804}
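/*
 * (Editor's note, not in the patch.)  The deferred iput()/ceph_put_cap()
 * above appear to be deliberate: both may block, so they are issued only
 * after s_cap_lock is dropped, while s_cap_iterator signals a racing
 * __ceph_remove_cap() to leave the list linkage for the next pass of
 * this loop to clean up (the cap->ci == NULL case).
 */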
805
806static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
807 void *arg)
808{
809 struct ceph_inode_info *ci = ceph_inode(inode);
810 dout("removing cap %p, ci is %p, inode is %p\n",
811 cap, ci, &ci->vfs_inode);
812 ceph_remove_cap(cap);
813 return 0;
814}
815
816/*
817 * caller must hold session s_mutex
818 */
819static void remove_session_caps(struct ceph_mds_session *session)
820{
821 dout("remove_session_caps on %p\n", session);
822 iterate_session_caps(session, remove_session_caps_cb, NULL);
823 BUG_ON(session->s_nr_caps > 0);
824 cleanup_cap_releases(session);
825}
826
827/*
828 * wake up any threads waiting on this session's caps. if the cap is
829 * old (didn't get renewed on the client reconnect), remove it now.
830 *
831 * caller must hold s_mutex.
832 */
833static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
834 void *arg)
835{
836 struct ceph_inode_info *ci = ceph_inode(inode);
837
838 wake_up(&ci->i_cap_wq);
839 if (arg) {
840 spin_lock(&inode->i_lock);
841 ci->i_wanted_max_size = 0;
842 ci->i_requested_max_size = 0;
843 spin_unlock(&inode->i_lock);
844 }
845 return 0;
846}
847
848static void wake_up_session_caps(struct ceph_mds_session *session,
849 int reconnect)
850{
851 dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
852 iterate_session_caps(session, wake_up_session_cb,
853 (void *)(unsigned long)reconnect);
854}
855
856/*
857 * Send periodic message to MDS renewing all currently held caps. The
858 * ack will reset the expiration for all caps from this session.
859 *
860 * caller holds s_mutex
861 */
862static int send_renew_caps(struct ceph_mds_client *mdsc,
863 struct ceph_mds_session *session)
864{
865 struct ceph_msg *msg;
866 int state;
867
868 if (time_after_eq(jiffies, session->s_cap_ttl) &&
869 time_after_eq(session->s_cap_ttl, session->s_renew_requested))
870 pr_info("mds%d caps stale\n", session->s_mds);
871 session->s_renew_requested = jiffies;
872
873 /* do not try to renew caps until a recovering mds has reconnected
874 * with its clients. */
875 state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
876 if (state < CEPH_MDS_STATE_RECONNECT) {
877 dout("send_renew_caps ignoring mds%d (%s)\n",
878 session->s_mds, ceph_mds_state_name(state));
879 return 0;
880 }
881
882 dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
883 ceph_mds_state_name(state));
884 msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
885 ++session->s_renew_seq);
886 if (IS_ERR(msg))
887 return PTR_ERR(msg);
888 ceph_con_send(&session->s_con, msg);
889 return 0;
890}
891
892/*
893 * Note new cap ttl, and any transition from stale -> not stale (fresh?).
894 *
895 * Called under session->s_mutex
896 */
897static void renewed_caps(struct ceph_mds_client *mdsc,
898 struct ceph_mds_session *session, int is_renew)
899{
900 int was_stale;
901 int wake = 0;
902
903 spin_lock(&session->s_cap_lock);
904 was_stale = is_renew && (session->s_cap_ttl == 0 ||
905 time_after_eq(jiffies, session->s_cap_ttl));
906
907 session->s_cap_ttl = session->s_renew_requested +
908 mdsc->mdsmap->m_session_timeout*HZ;
909
910 if (was_stale) {
911 if (time_before(jiffies, session->s_cap_ttl)) {
912 pr_info("mds%d caps renewed\n", session->s_mds);
913 wake = 1;
914 } else {
915 pr_info("mds%d caps still stale\n", session->s_mds);
916 }
917 }
918 dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
919 session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
 920 time_before(jiffies, session->s_cap_ttl) ? "fresh" : "stale");
921 spin_unlock(&session->s_cap_lock);
922
923 if (wake)
924 wake_up_session_caps(session, 0);
925}
926
927/*
928 * send a session close request
929 */
930static int request_close_session(struct ceph_mds_client *mdsc,
931 struct ceph_mds_session *session)
932{
933 struct ceph_msg *msg;
934 int err = 0;
935
936 dout("request_close_session mds%d state %s seq %lld\n",
937 session->s_mds, session_state_name(session->s_state),
938 session->s_seq);
939 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
940 if (IS_ERR(msg))
941 err = PTR_ERR(msg);
942 else
943 ceph_con_send(&session->s_con, msg);
944 return err;
945}
946
947/*
948 * Called with s_mutex held.
949 */
950static int __close_session(struct ceph_mds_client *mdsc,
951 struct ceph_mds_session *session)
952{
953 if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
954 return 0;
955 session->s_state = CEPH_MDS_SESSION_CLOSING;
956 return request_close_session(mdsc, session);
957}
958
959/*
960 * Trim old(er) caps.
961 *
962 * Because we can't cache an inode without one or more caps, we do
963 * this indirectly: if a cap is unused, we prune its aliases, at which
 964 * point the inode will hopefully get dropped too.
965 *
966 * Yes, this is a bit sloppy. Our only real goal here is to respond to
967 * memory pressure from the MDS, though, so it needn't be perfect.
968 */
969static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
970{
971 struct ceph_mds_session *session = arg;
972 struct ceph_inode_info *ci = ceph_inode(inode);
973 int used, oissued, mine;
974
975 if (session->s_trim_caps <= 0)
976 return -1;
977
978 spin_lock(&inode->i_lock);
979 mine = cap->issued | cap->implemented;
980 used = __ceph_caps_used(ci);
981 oissued = __ceph_caps_issued_other(ci, cap);
982
983 dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n",
984 inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
985 ceph_cap_string(used));
986 if (ci->i_dirty_caps)
987 goto out; /* dirty caps */
988 if ((used & ~oissued) & mine)
989 goto out; /* we need these caps */
990
991 session->s_trim_caps--;
992 if (oissued) {
993 /* we aren't the only cap.. just remove us */
994 __ceph_remove_cap(cap);
995 } else {
996 /* try to drop referring dentries */
997 spin_unlock(&inode->i_lock);
998 d_prune_aliases(inode);
999 dout("trim_caps_cb %p cap %p pruned, count now %d\n",
1000 inode, cap, atomic_read(&inode->i_count));
1001 return 0;
1002 }
1003
1004out:
1005 spin_unlock(&inode->i_lock);
1006 return 0;
1007}
1008
1009/*
1010 * Trim session cap count down to some max number.
1011 */
1012static int trim_caps(struct ceph_mds_client *mdsc,
1013 struct ceph_mds_session *session,
1014 int max_caps)
1015{
1016 int trim_caps = session->s_nr_caps - max_caps;
1017
1018 dout("trim_caps mds%d start: %d / %d, trim %d\n",
1019 session->s_mds, session->s_nr_caps, max_caps, trim_caps);
1020 if (trim_caps > 0) {
1021 session->s_trim_caps = trim_caps;
1022 iterate_session_caps(session, trim_caps_cb, session);
1023 dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
1024 session->s_mds, session->s_nr_caps, max_caps,
1025 trim_caps - session->s_trim_caps);
1026 session->s_trim_caps = 0;
1027 }
1028 return 0;
1029}
1030
1031/*
1032 * Allocate cap_release messages. If there is a partially full message
 1033 * in the queue, try to allocate enough to cover its remainder, so that
1034 * we can send it immediately.
1035 *
1036 * Called under s_mutex.
1037 */
1038static int add_cap_releases(struct ceph_mds_client *mdsc,
1039 struct ceph_mds_session *session,
1040 int extra)
1041{
1042 struct ceph_msg *msg;
1043 struct ceph_mds_cap_release *head;
1044 int err = -ENOMEM;
1045
1046 if (extra < 0)
1047 extra = mdsc->client->mount_args->cap_release_safety;
1048
1049 spin_lock(&session->s_cap_lock);
1050
1051 if (!list_empty(&session->s_cap_releases)) {
1052 msg = list_first_entry(&session->s_cap_releases,
1053 struct ceph_msg,
1054 list_head);
1055 head = msg->front.iov_base;
1056 extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
1057 }
1058
1059 while (session->s_num_cap_releases < session->s_nr_caps + extra) {
1060 spin_unlock(&session->s_cap_lock);
1061 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
1062 0, 0, NULL);
1063 if (!msg)
1064 goto out_unlocked;
1065 dout("add_cap_releases %p msg %p now %d\n", session, msg,
1066 (int)msg->front.iov_len);
1067 head = msg->front.iov_base;
1068 head->num = cpu_to_le32(0);
1069 msg->front.iov_len = sizeof(*head);
1070 spin_lock(&session->s_cap_lock);
1071 list_add(&msg->list_head, &session->s_cap_releases);
1072 session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
1073 }
1074
1075 if (!list_empty(&session->s_cap_releases)) {
1076 msg = list_first_entry(&session->s_cap_releases,
1077 struct ceph_msg,
1078 list_head);
1079 head = msg->front.iov_base;
1080 if (head->num) {
1081 dout(" queueing non-full %p (%d)\n", msg,
1082 le32_to_cpu(head->num));
1083 list_move_tail(&msg->list_head,
1084 &session->s_cap_releases_done);
1085 session->s_num_cap_releases -=
1086 CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
1087 }
1088 }
1089 err = 0;
1090 spin_unlock(&session->s_cap_lock);
1091out_unlocked:
1092 return err;
1093}
1094
1095/*
 1096 * check whether all dirty inode data has been flushed to disk.
1097 *
1098 * returns true if we've flushed through want_flush_seq
1099 */
1100static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
1101{
1102 int mds, ret = 1;
1103
1104 dout("check_cap_flush want %lld\n", want_flush_seq);
1105 mutex_lock(&mdsc->mutex);
1106 for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
1107 struct ceph_mds_session *session = mdsc->sessions[mds];
1108
1109 if (!session)
1110 continue;
1111 get_session(session);
1112 mutex_unlock(&mdsc->mutex);
1113
1114 mutex_lock(&session->s_mutex);
1115 if (!list_empty(&session->s_cap_flushing)) {
1116 struct ceph_inode_info *ci =
1117 list_entry(session->s_cap_flushing.next,
1118 struct ceph_inode_info,
1119 i_flushing_item);
1120 struct inode *inode = &ci->vfs_inode;
1121
1122 spin_lock(&inode->i_lock);
1123 if (ci->i_cap_flush_seq <= want_flush_seq) {
1124 dout("check_cap_flush still flushing %p "
1125 "seq %lld <= %lld to mds%d\n", inode,
1126 ci->i_cap_flush_seq, want_flush_seq,
1127 session->s_mds);
1128 ret = 0;
1129 }
1130 spin_unlock(&inode->i_lock);
1131 }
1132 mutex_unlock(&session->s_mutex);
1133 ceph_put_mds_session(session);
1134
1135 if (!ret)
1136 return ret;
1137 mutex_lock(&mdsc->mutex);
1138 }
1139
1140 mutex_unlock(&mdsc->mutex);
1141 dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
1142 return ret;
1143}
1144
1145/*
1146 * called under s_mutex
1147 */
1148static void send_cap_releases(struct ceph_mds_client *mdsc,
1149 struct ceph_mds_session *session)
1150{
1151 struct ceph_msg *msg;
1152
1153 dout("send_cap_releases mds%d\n", session->s_mds);
1154 while (1) {
1155 spin_lock(&session->s_cap_lock);
1156 if (list_empty(&session->s_cap_releases_done))
1157 break;
1158 msg = list_first_entry(&session->s_cap_releases_done,
1159 struct ceph_msg, list_head);
1160 list_del_init(&msg->list_head);
1161 spin_unlock(&session->s_cap_lock);
1162 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1163 dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
1164 ceph_con_send(&session->s_con, msg);
1165 }
1166 spin_unlock(&session->s_cap_lock);
1167}
1168
1169/*
1170 * requests
1171 */
1172
1173/*
1174 * Create an mds request.
1175 */
1176struct ceph_mds_request *
1177ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1178{
1179 struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
1180
1181 if (!req)
1182 return ERR_PTR(-ENOMEM);
1183
1184 req->r_started = jiffies;
1185 req->r_resend_mds = -1;
1186 INIT_LIST_HEAD(&req->r_unsafe_dir_item);
1187 req->r_fmode = -1;
1188 kref_init(&req->r_kref);
1189 INIT_LIST_HEAD(&req->r_wait);
1190 init_completion(&req->r_completion);
1191 init_completion(&req->r_safe_completion);
1192 INIT_LIST_HEAD(&req->r_unsafe_item);
1193
1194 req->r_op = op;
1195 req->r_direct_mode = mode;
1196 return req;
1197}
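/*
 * (Editor's note.)  The typical caller pattern, as in ceph_do_getattr()
 * in fs/ceph/inode.c earlier in this diff:
 *
 *	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR,
 *				       USE_ANY_MDS);
 *	if (IS_ERR(req))
 *		return PTR_ERR(req);
 *	req->r_inode = igrab(inode);
 *	req->r_num_caps = 1;
 *	err = ceph_mdsc_do_request(mdsc, NULL, req);
 *	ceph_mdsc_put_request(req);
 */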
1198
1199/*
 1200 * return the oldest (lowest tid) request in the tree, or NULL if none.
1201 *
1202 * called under mdsc->mutex.
1203 */
1204static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
1205{
1206 if (RB_EMPTY_ROOT(&mdsc->request_tree))
1207 return NULL;
1208 return rb_entry(rb_first(&mdsc->request_tree),
1209 struct ceph_mds_request, r_node);
1210}
1211
1212static u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
1213{
1214 struct ceph_mds_request *req = __get_oldest_req(mdsc);
1215
1216 if (req)
1217 return req->r_tid;
1218 return 0;
1219}
1220
1221/*
1222 * Build a dentry's path. Allocate on heap; caller must kfree. Based
1223 * on build_path_from_dentry in fs/cifs/dir.c.
1224 *
1225 * If @stop_on_nosnap, generate path relative to the first non-snapped
1226 * inode.
1227 *
1228 * Encode hidden .snap dirs as a double /, i.e.
1229 * foo/.snap/bar -> foo//bar
1230 */
1231char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
1232 int stop_on_nosnap)
1233{
1234 struct dentry *temp;
1235 char *path;
1236 int len, pos;
1237
1238 if (dentry == NULL)
1239 return ERR_PTR(-EINVAL);
1240
1241retry:
1242 len = 0;
1243 for (temp = dentry; !IS_ROOT(temp);) {
1244 struct inode *inode = temp->d_inode;
1245 if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
1246 len++; /* slash only */
1247 else if (stop_on_nosnap && inode &&
1248 ceph_snap(inode) == CEPH_NOSNAP)
1249 break;
1250 else
1251 len += 1 + temp->d_name.len;
1252 temp = temp->d_parent;
1253 if (temp == NULL) {
1254 pr_err("build_path_dentry corrupt dentry %p\n", dentry);
1255 return ERR_PTR(-EINVAL);
1256 }
1257 }
1258 if (len)
1259 len--; /* no leading '/' */
1260
1261 path = kmalloc(len+1, GFP_NOFS);
1262 if (path == NULL)
1263 return ERR_PTR(-ENOMEM);
1264 pos = len;
1265 path[pos] = 0; /* trailing null */
1266 for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
1267 struct inode *inode = temp->d_inode;
1268
1269 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
1270 dout("build_path_dentry path+%d: %p SNAPDIR\n",
1271 pos, temp);
1272 } else if (stop_on_nosnap && inode &&
1273 ceph_snap(inode) == CEPH_NOSNAP) {
1274 break;
1275 } else {
1276 pos -= temp->d_name.len;
1277 if (pos < 0)
1278 break;
1279 strncpy(path + pos, temp->d_name.name,
1280 temp->d_name.len);
1281 dout("build_path_dentry path+%d: %p '%.*s'\n",
1282 pos, temp, temp->d_name.len, path + pos);
1283 }
1284 if (pos)
1285 path[--pos] = '/';
1286 temp = temp->d_parent;
1287 if (temp == NULL) {
1288 pr_err("build_path_dentry corrupt dentry\n");
1289 kfree(path);
1290 return ERR_PTR(-EINVAL);
1291 }
1292 }
1293 if (pos != 0) {
1294 pr_err("build_path_dentry did not end path lookup where "
1295 "expected, namelen is %d, pos is %d\n", len, pos);
1296 /* presumably this is only possible if racing with a
1297 rename of one of the parent directories (we can not
1298 lock the dentries above us to prevent this, but
1299 retrying should be harmless) */
1300 kfree(path);
1301 goto retry;
1302 }
1303
1304 *base = ceph_ino(temp->d_inode);
1305 *plen = len;
1306 dout("build_path_dentry on %p %d built %llx '%.*s'\n",
1307 dentry, atomic_read(&dentry->d_count), *base, len, path);
1308 return path;
1309}
1310
1311static int build_dentry_path(struct dentry *dentry,
1312 const char **ppath, int *ppathlen, u64 *pino,
1313 int *pfreepath)
1314{
1315 char *path;
1316
1317 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) {
1318 *pino = ceph_ino(dentry->d_parent->d_inode);
1319 *ppath = dentry->d_name.name;
1320 *ppathlen = dentry->d_name.len;
1321 return 0;
1322 }
1323 path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
1324 if (IS_ERR(path))
1325 return PTR_ERR(path);
1326 *ppath = path;
1327 *pfreepath = 1;
1328 return 0;
1329}
1330
1331static int build_inode_path(struct inode *inode,
1332 const char **ppath, int *ppathlen, u64 *pino,
1333 int *pfreepath)
1334{
1335 struct dentry *dentry;
1336 char *path;
1337
1338 if (ceph_snap(inode) == CEPH_NOSNAP) {
1339 *pino = ceph_ino(inode);
1340 *ppathlen = 0;
1341 return 0;
1342 }
1343 dentry = d_find_alias(inode);
1344 path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
1345 dput(dentry);
1346 if (IS_ERR(path))
1347 return PTR_ERR(path);
1348 *ppath = path;
1349 *pfreepath = 1;
1350 return 0;
1351}
1352
1353/*
1354 * request arguments may be specified via an inode *, a dentry *, or
1355 * an explicit ino+path.
1356 */
1357static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
1358 const char *rpath, u64 rino,
1359 const char **ppath, int *pathlen,
1360 u64 *ino, int *freepath)
1361{
1362 int r = 0;
1363
1364 if (rinode) {
1365 r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
1366 dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
1367 ceph_snap(rinode));
1368 } else if (rdentry) {
1369 r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
1370 dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
1371 *ppath);
1372 } else if (rpath) {
1373 *ino = rino;
1374 *ppath = rpath;
1375 *pathlen = strlen(rpath);
1376 dout(" path %.*s\n", *pathlen, rpath);
1377 }
1378
1379 return r;
1380}
1381
1382/*
1383 * called under mdsc->mutex
1384 */
1385static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1386 struct ceph_mds_request *req,
1387 int mds)
1388{
1389 struct ceph_msg *msg;
1390 struct ceph_mds_request_head *head;
1391 const char *path1 = NULL;
1392 const char *path2 = NULL;
1393 u64 ino1 = 0, ino2 = 0;
1394 int pathlen1 = 0, pathlen2 = 0;
1395 int freepath1 = 0, freepath2 = 0;
1396 int len;
1397 u16 releases;
1398 void *p, *end;
1399 int ret;
1400
1401 ret = set_request_path_attr(req->r_inode, req->r_dentry,
1402 req->r_path1, req->r_ino1.ino,
1403 &path1, &pathlen1, &ino1, &freepath1);
1404 if (ret < 0) {
1405 msg = ERR_PTR(ret);
1406 goto out;
1407 }
1408
1409 ret = set_request_path_attr(NULL, req->r_old_dentry,
1410 req->r_path2, req->r_ino2.ino,
1411 &path2, &pathlen2, &ino2, &freepath2);
1412 if (ret < 0) {
1413 msg = ERR_PTR(ret);
1414 goto out_free1;
1415 }
1416
1417 len = sizeof(*head) +
1418 pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64));
1419
1420 /* calculate (max) length for cap releases */
1421 len += sizeof(struct ceph_mds_request_release) *
1422 (!!req->r_inode_drop + !!req->r_dentry_drop +
1423 !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
1424 if (req->r_dentry_drop)
1425 len += req->r_dentry->d_name.len;
1426 if (req->r_old_dentry_drop)
1427 len += req->r_old_dentry->d_name.len;
1428
1429 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL);
1430 if (IS_ERR(msg))
1431 goto out_free2;
1432
1433 msg->hdr.tid = cpu_to_le64(req->r_tid);
1434
1435 head = msg->front.iov_base;
1436 p = msg->front.iov_base + sizeof(*head);
1437 end = msg->front.iov_base + msg->front.iov_len;
1438
1439 head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
1440 head->op = cpu_to_le32(req->r_op);
1441 head->caller_uid = cpu_to_le32(current_fsuid());
1442 head->caller_gid = cpu_to_le32(current_fsgid());
1443 head->args = req->r_args;
1444
1445 ceph_encode_filepath(&p, end, ino1, path1);
1446 ceph_encode_filepath(&p, end, ino2, path2);
1447
1448 /* cap releases */
1449 releases = 0;
1450 if (req->r_inode_drop)
1451 releases += ceph_encode_inode_release(&p,
1452 req->r_inode ? req->r_inode : req->r_dentry->d_inode,
1453 mds, req->r_inode_drop, req->r_inode_unless, 0);
1454 if (req->r_dentry_drop)
1455 releases += ceph_encode_dentry_release(&p, req->r_dentry,
1456 mds, req->r_dentry_drop, req->r_dentry_unless);
1457 if (req->r_old_dentry_drop)
1458 releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
1459 mds, req->r_old_dentry_drop, req->r_old_dentry_unless);
1460 if (req->r_old_inode_drop)
1461 releases += ceph_encode_inode_release(&p,
1462 req->r_old_dentry->d_inode,
1463 mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
1464 head->num_releases = cpu_to_le16(releases);
1465
1466 BUG_ON(p > end);
1467 msg->front.iov_len = p - msg->front.iov_base;
1468 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1469
1470 msg->pages = req->r_pages;
1471 msg->nr_pages = req->r_num_pages;
1472 msg->hdr.data_len = cpu_to_le32(req->r_data_len);
1473 msg->hdr.data_off = cpu_to_le16(0);
1474
1475out_free2:
1476 if (freepath2)
1477 kfree((char *)path2);
1478out_free1:
1479 if (freepath1)
1480 kfree((char *)path1);
1481out:
1482 return msg;
1483}
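/*
 * (Editor's note.)  The request front built above is, roughly: a
 * struct ceph_mds_request_head, two encoded filepaths (ino plus a
 * length-prefixed path string), and up to four cap release entries
 * (inode, dentry, old inode, old dentry), counted in
 * head->num_releases.
 */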
1484
1485/*
 1486 * called under mdsc->mutex on error, and with no mutex held
 1487 * on success.
1488 */
1489static void complete_request(struct ceph_mds_client *mdsc,
1490 struct ceph_mds_request *req)
1491{
1492 if (req->r_callback)
1493 req->r_callback(mdsc, req);
1494 else
1495 complete(&req->r_completion);
1496}
1497
1498/*
1499 * called under mdsc->mutex
1500 */
1501static int __prepare_send_request(struct ceph_mds_client *mdsc,
1502 struct ceph_mds_request *req,
1503 int mds)
1504{
1505 struct ceph_mds_request_head *rhead;
1506 struct ceph_msg *msg;
1507 int flags = 0;
1508
1509 req->r_mds = mds;
1510 req->r_attempts++;
1511 dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
1512 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
1513
1514 if (req->r_request) {
1515 ceph_msg_put(req->r_request);
1516 req->r_request = NULL;
1517 }
1518 msg = create_request_message(mdsc, req, mds);
1519 if (IS_ERR(msg)) {
1520 req->r_reply = ERR_PTR(PTR_ERR(msg));
1521 complete_request(mdsc, req);
1522 return -PTR_ERR(msg);
1523 }
1524 req->r_request = msg;
1525
1526 rhead = msg->front.iov_base;
1527 rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
1528 if (req->r_got_unsafe)
1529 flags |= CEPH_MDS_FLAG_REPLAY;
1530 if (req->r_locked_dir)
1531 flags |= CEPH_MDS_FLAG_WANT_DENTRY;
1532 rhead->flags = cpu_to_le32(flags);
1533 rhead->num_fwd = req->r_num_fwd;
1534 rhead->num_retry = req->r_attempts - 1;
1535
1536 dout(" r_locked_dir = %p\n", req->r_locked_dir);
1537
1538 if (req->r_target_inode && req->r_got_unsafe)
1539 rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
1540 else
1541 rhead->ino = 0;
1542 return 0;
1543}
1544
1545/*
1546 * send request, or put it on the appropriate wait list.
1547 */
1548static int __do_request(struct ceph_mds_client *mdsc,
1549 struct ceph_mds_request *req)
1550{
1551 struct ceph_mds_session *session = NULL;
1552 int mds = -1;
1553 int err = -EAGAIN;
1554
1555 if (req->r_reply)
1556 goto out;
1557
1558 if (req->r_timeout &&
1559 time_after_eq(jiffies, req->r_started + req->r_timeout)) {
1560 dout("do_request timed out\n");
1561 err = -EIO;
1562 goto finish;
1563 }
1564
1565 mds = __choose_mds(mdsc, req);
1566 if (mds < 0 ||
1567 ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
1568 dout("do_request no mds or not active, waiting for map\n");
1569 list_add(&req->r_wait, &mdsc->waiting_for_map);
1570 goto out;
1571 }
1572
1573 /* get, open session */
1574 session = __ceph_lookup_mds_session(mdsc, mds);
1575 if (!session) {
1576 session = register_session(mdsc, mds);
1577 if (IS_ERR(session)) {
1578 err = PTR_ERR(session);
1579 goto finish;
1580 }
1581 }
1582 dout("do_request mds%d session %p state %s\n", mds, session,
1583 session_state_name(session->s_state));
1584 if (session->s_state != CEPH_MDS_SESSION_OPEN &&
1585 session->s_state != CEPH_MDS_SESSION_HUNG) {
1586 if (session->s_state == CEPH_MDS_SESSION_NEW ||
1587 session->s_state == CEPH_MDS_SESSION_CLOSING)
1588 __open_session(mdsc, session);
1589 list_add(&req->r_wait, &session->s_waiting);
1590 goto out_session;
1591 }
1592
1593 /* send request */
1594 req->r_session = get_session(session);
1595 req->r_resend_mds = -1; /* forget any previous mds hint */
1596
1597 if (req->r_request_started == 0) /* note request start time */
1598 req->r_request_started = jiffies;
1599
1600 err = __prepare_send_request(mdsc, req, mds);
1601 if (!err) {
1602 ceph_msg_get(req->r_request);
1603 ceph_con_send(&session->s_con, req->r_request);
1604 }
1605
1606out_session:
1607 ceph_put_mds_session(session);
1608out:
1609 return err;
1610
1611finish:
1612 req->r_reply = ERR_PTR(err);
1613 complete_request(mdsc, req);
1614 goto out;
1615}
1616
1617/*
1618 * called under mdsc->mutex
1619 */
1620static void __wake_requests(struct ceph_mds_client *mdsc,
1621 struct list_head *head)
1622{
1623 struct ceph_mds_request *req, *nreq;
1624
1625 list_for_each_entry_safe(req, nreq, head, r_wait) {
1626 list_del_init(&req->r_wait);
1627 __do_request(mdsc, req);
1628 }
1629}
1630
1631/*
1632 * Wake up threads with requests pending for @mds, so that they can
1633 * resubmit their requests to a possibly different mds. If @all is set,
 1634 * wake up those whose requests have been forwarded to @mds, too.
1635 */
1636static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all)
1637{
1638 struct ceph_mds_request *req;
1639 struct rb_node *p;
1640
1641 dout("kick_requests mds%d\n", mds);
1642 for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) {
1643 req = rb_entry(p, struct ceph_mds_request, r_node);
1644 if (req->r_got_unsafe)
1645 continue;
1646 if (req->r_session &&
1647 req->r_session->s_mds == mds) {
1648 dout(" kicking tid %llu\n", req->r_tid);
1649 put_request_session(req);
1650 __do_request(mdsc, req);
1651 }
1652 }
1653}
1654
1655void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
1656 struct ceph_mds_request *req)
1657{
1658 dout("submit_request on %p\n", req);
1659 mutex_lock(&mdsc->mutex);
1660 __register_request(mdsc, req, NULL);
1661 __do_request(mdsc, req);
1662 mutex_unlock(&mdsc->mutex);
1663}
1664
1665/*
 1666 * Synchronously perform an mds request, taking care of all of the
 1667 * session setup, forwarding, and retry details.
1668 */
1669int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
1670 struct inode *dir,
1671 struct ceph_mds_request *req)
1672{
1673 int err;
1674
1675 dout("do_request on %p\n", req);
1676
1677 /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */
1678 if (req->r_inode)
1679 ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
1680 if (req->r_locked_dir)
1681 ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
1682 if (req->r_old_dentry)
1683 ceph_get_cap_refs(
1684 ceph_inode(req->r_old_dentry->d_parent->d_inode),
1685 CEPH_CAP_PIN);
1686
1687 /* issue */
1688 mutex_lock(&mdsc->mutex);
1689 __register_request(mdsc, req, dir);
1690 __do_request(mdsc, req);
1691
1692 /* wait */
1693 if (!req->r_reply) {
1694 mutex_unlock(&mdsc->mutex);
1695 if (req->r_timeout) {
1696 err = (long)wait_for_completion_interruptible_timeout(
1697 &req->r_completion, req->r_timeout);
1698 if (err == 0)
1699 req->r_reply = ERR_PTR(-EIO);
1700 else if (err < 0)
1701 req->r_reply = ERR_PTR(err);
1702 } else {
1703 err = wait_for_completion_interruptible(
1704 &req->r_completion);
1705 if (err)
1706 req->r_reply = ERR_PTR(err);
1707 }
1708 mutex_lock(&mdsc->mutex);
1709 }
1710
1711 if (IS_ERR(req->r_reply)) {
1712 err = PTR_ERR(req->r_reply);
1713 req->r_reply = NULL;
1714
1715 if (err == -ERESTARTSYS) {
1716 /* aborted */
1717 req->r_aborted = true;
1718
1719 if (req->r_locked_dir &&
1720 (req->r_op & CEPH_MDS_OP_WRITE)) {
1721 struct ceph_inode_info *ci =
1722 ceph_inode(req->r_locked_dir);
1723
1724 dout("aborted, clearing I_COMPLETE on %p\n",
1725 req->r_locked_dir);
1726 spin_lock(&req->r_locked_dir->i_lock);
1727 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1728 ci->i_release_count++;
1729 spin_unlock(&req->r_locked_dir->i_lock);
1730 }
1731 } else {
1732 /* clean up this request */
1733 __unregister_request(mdsc, req);
1734 if (!list_empty(&req->r_unsafe_item))
1735 list_del_init(&req->r_unsafe_item);
1736 complete(&req->r_safe_completion);
1737 }
1738 } else if (req->r_err) {
1739 err = req->r_err;
1740 } else {
1741 err = le32_to_cpu(req->r_reply_info.head->result);
1742 }
1743 mutex_unlock(&mdsc->mutex);
1744
1745 dout("do_request %p done, result %d\n", req, err);
1746 return err;
1747}
1748
1749/*
1750 * Handle mds reply.
1751 *
1752 * We take the session mutex and parse and process the reply immediately.
1753 * This preserves the logical ordering of replies, capabilities, etc., sent
1754 * by the MDS as they are applied to our local cache.
1755 */
1756static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1757{
1758 struct ceph_mds_client *mdsc = session->s_mdsc;
1759 struct ceph_mds_request *req;
1760 struct ceph_mds_reply_head *head = msg->front.iov_base;
1761 struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */
1762 u64 tid;
1763 int err, result;
1764 int mds = session->s_mds;
1765
1766 if (msg->front.iov_len < sizeof(*head)) {
1767 pr_err("mdsc_handle_reply got corrupt (short) reply\n");
1768 ceph_msg_dump(msg);
1769 return;
1770 }
1771
1772 /* get request, session */
1773 tid = le64_to_cpu(msg->hdr.tid);
1774 mutex_lock(&mdsc->mutex);
1775 req = __lookup_request(mdsc, tid);
1776 if (!req) {
1777 dout("handle_reply on unknown tid %llu\n", tid);
1778 mutex_unlock(&mdsc->mutex);
1779 return;
1780 }
1781 dout("handle_reply %p\n", req);
1782
1783 /* correct session? */
1784 if (req->r_session != session) {
1785 pr_err("mdsc_handle_reply got %llu on session mds%d"
1786 " not mds%d\n", tid, session->s_mds,
1787 req->r_session ? req->r_session->s_mds : -1);
1788 mutex_unlock(&mdsc->mutex);
1789 goto out;
1790 }
1791
1792 /* dup? */
1793 if ((req->r_got_unsafe && !head->safe) ||
1794 (req->r_got_safe && head->safe)) {
1795 pr_warning("got a dup %s reply on %llu from mds%d\n",
1796 head->safe ? "safe" : "unsafe", tid, mds);
1797 mutex_unlock(&mdsc->mutex);
1798 goto out;
1799 }
1800
1801 result = le32_to_cpu(head->result);
1802
1803 /*
1804 * Tolerate 2 consecutive ESTALEs from the same mds.
1805 * FIXME: we should be looking at the cap migrate_seq.
1806 */
1807 if (result == -ESTALE) {
1808 req->r_direct_mode = USE_AUTH_MDS;
1809 req->r_num_stale++;
1810 if (req->r_num_stale <= 2) {
1811 __do_request(mdsc, req);
1812 mutex_unlock(&mdsc->mutex);
1813 goto out;
1814 }
1815 } else {
1816 req->r_num_stale = 0;
1817 }
1818
1819 if (head->safe) {
1820 req->r_got_safe = true;
1821 __unregister_request(mdsc, req);
1822 complete(&req->r_safe_completion);
1823
1824 if (req->r_got_unsafe) {
1825 /*
1826 * We already handled the unsafe response, now do the
1827 * cleanup. No need to examine the response; the MDS
1828 * doesn't include any result info in the safe
1829 * response. And even if it did, there is nothing
1830 * useful we could do with a revised return value.
1831 */
1832 dout("got safe reply %llu, mds%d\n", tid, mds);
1833 list_del_init(&req->r_unsafe_item);
1834
1835 /* last unsafe request during umount? */
1836 if (mdsc->stopping && !__get_oldest_req(mdsc))
1837 complete(&mdsc->safe_umount_waiters);
1838 mutex_unlock(&mdsc->mutex);
1839 goto out;
1840 }
1841 }
1842
1843 BUG_ON(req->r_reply);
1844
1845 if (!head->safe) {
1846 req->r_got_unsafe = true;
1847 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
1848 }
1849
1850 dout("handle_reply tid %lld result %d\n", tid, result);
1851 rinfo = &req->r_reply_info;
1852 err = parse_reply_info(msg, rinfo);
1853 mutex_unlock(&mdsc->mutex);
1854
1855 mutex_lock(&session->s_mutex);
1856 if (err < 0) {
1857 pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds);
1858 ceph_msg_dump(msg);
1859 goto out_err;
1860 }
1861
1862 /* snap trace */
1863 if (rinfo->snapblob_len) {
1864 down_write(&mdsc->snap_rwsem);
1865 ceph_update_snap_trace(mdsc, rinfo->snapblob,
1866 rinfo->snapblob + rinfo->snapblob_len,
1867 le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP);
1868 downgrade_write(&mdsc->snap_rwsem);
1869 } else {
1870 down_read(&mdsc->snap_rwsem);
1871 }
1872
1873 /* insert trace into our cache */
1874 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
1875 if (err == 0) {
1876 if (result == 0 && rinfo->dir_nr)
1877 ceph_readdir_prepopulate(req, req->r_session);
1878 ceph_unreserve_caps(&req->r_caps_reservation);
1879 }
1880
1881 up_read(&mdsc->snap_rwsem);
1882out_err:
1883 if (err) {
1884 req->r_err = err;
1885 } else {
1886 req->r_reply = msg;
1887 ceph_msg_get(msg);
1888 }
1889
1890 add_cap_releases(mdsc, req->r_session, -1);
1891 mutex_unlock(&session->s_mutex);
1892
1893 /* kick calling process */
1894 complete_request(mdsc, req);
1895out:
1896 ceph_mdsc_put_request(req);
1897 return;
1898}
1899
1900
1901
1902/*
1903 * handle mds notification that our request has been forwarded.
1904 */
1905static void handle_forward(struct ceph_mds_client *mdsc,
1906 struct ceph_mds_session *session,
1907 struct ceph_msg *msg)
1908{
1909 struct ceph_mds_request *req;
1910 u64 tid = le64_to_cpu(msg->hdr.tid);
1911 u32 next_mds;
1912 u32 fwd_seq;
1913 int err = -EINVAL;
1914 void *p = msg->front.iov_base;
1915 void *end = p + msg->front.iov_len;
1916
1917 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1918 next_mds = ceph_decode_32(&p);
1919 fwd_seq = ceph_decode_32(&p);
1920
1921 mutex_lock(&mdsc->mutex);
1922 req = __lookup_request(mdsc, tid);
1923 if (!req) {
1924 dout("forward %llu to mds%d - req dne\n", tid, next_mds);
1925 goto out; /* dup reply? */
1926 }
1927
1928 if (fwd_seq <= req->r_num_fwd) {
1929 dout("forward %llu to mds%d - old seq %d <= %d\n",
1930 tid, next_mds, req->r_num_fwd, fwd_seq);
1931 } else {
1932 /* resend. forward race not possible; mds would drop */
1933 dout("forward %llu to mds%d (we resend)\n", tid, next_mds);
1934 req->r_num_fwd = fwd_seq;
1935 req->r_resend_mds = next_mds;
1936 put_request_session(req);
1937 __do_request(mdsc, req);
1938 }
1939 ceph_mdsc_put_request(req);
1940out:
1941 mutex_unlock(&mdsc->mutex);
1942 return;
1943
1944bad:
1945 pr_err("mdsc_handle_forward decode error err=%d\n", err);
1946}
1947
1948/*
1949 * handle an mds session control message
1950 */
1951static void handle_session(struct ceph_mds_session *session,
1952 struct ceph_msg *msg)
1953{
1954 struct ceph_mds_client *mdsc = session->s_mdsc;
1955 u32 op;
1956 u64 seq;
1957 int mds = session->s_mds;
1958 struct ceph_mds_session_head *h = msg->front.iov_base;
1959 int wake = 0;
1960
1961 /* decode */
1962 if (msg->front.iov_len != sizeof(*h))
1963 goto bad;
1964 op = le32_to_cpu(h->op);
1965 seq = le64_to_cpu(h->seq);
1966
1967 mutex_lock(&mdsc->mutex);
1968 if (op == CEPH_SESSION_CLOSE)
1969 __unregister_session(mdsc, session);
1970 /* FIXME: this ttl calculation is generous */
1971 session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
1972 mutex_unlock(&mdsc->mutex);
1973
1974 mutex_lock(&session->s_mutex);
1975
1976 dout("handle_session mds%d %s %p state %s seq %llu\n",
1977 mds, ceph_session_op_name(op), session,
1978 session_state_name(session->s_state), seq);
1979
1980 if (session->s_state == CEPH_MDS_SESSION_HUNG) {
1981 session->s_state = CEPH_MDS_SESSION_OPEN;
1982 pr_info("mds%d came back\n", session->s_mds);
1983 }
1984
1985 switch (op) {
1986 case CEPH_SESSION_OPEN:
1987 session->s_state = CEPH_MDS_SESSION_OPEN;
1988 renewed_caps(mdsc, session, 0);
1989 wake = 1;
1990 if (mdsc->stopping)
1991 __close_session(mdsc, session);
1992 break;
1993
1994 case CEPH_SESSION_RENEWCAPS:
1995 if (session->s_renew_seq == seq)
1996 renewed_caps(mdsc, session, 1);
1997 break;
1998
1999 case CEPH_SESSION_CLOSE:
2000 remove_session_caps(session);
2001 wake = 1; /* for good measure */
2002 complete(&mdsc->session_close_waiters);
2003 kick_requests(mdsc, mds, 0); /* cur only */
2004 break;
2005
2006 case CEPH_SESSION_STALE:
2007 pr_info("mds%d caps went stale, renewing\n",
2008 session->s_mds);
2009 spin_lock(&session->s_cap_lock);
2010 session->s_cap_gen++;
2011 session->s_cap_ttl = 0;
2012 spin_unlock(&session->s_cap_lock);
2013 send_renew_caps(mdsc, session);
2014 break;
2015
2016 case CEPH_SESSION_RECALL_STATE:
2017 trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
2018 break;
2019
2020 default:
2021 pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
2022 WARN_ON(1);
2023 }
2024
2025 mutex_unlock(&session->s_mutex);
2026 if (wake) {
2027 mutex_lock(&mdsc->mutex);
2028 __wake_requests(mdsc, &session->s_waiting);
2029 mutex_unlock(&mdsc->mutex);
2030 }
2031 return;
2032
2033bad:
2034 pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
2035 (int)msg->front.iov_len);
2036 ceph_msg_dump(msg);
2037 return;
2038}
2039
2040
2041/*
2042 * called under session->s_mutex.
2043 */
2044static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
2045 struct ceph_mds_session *session)
2046{
2047 struct ceph_mds_request *req, *nreq;
2048 int err;
2049
2050 dout("replay_unsafe_requests mds%d\n", session->s_mds);
2051
2052 mutex_lock(&mdsc->mutex);
2053 list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
2054 err = __prepare_send_request(mdsc, req, session->s_mds);
2055 if (!err) {
2056 ceph_msg_get(req->r_request);
2057 ceph_con_send(&session->s_con, req->r_request);
2058 }
2059 }
2060 mutex_unlock(&mdsc->mutex);
2061}
2062
2063/*
2064 * Encode information about a cap for a reconnect with the MDS.
2065 */
2066static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2067 void *arg)
2068{
2069 struct ceph_mds_cap_reconnect rec;
2070 struct ceph_inode_info *ci;
2071 struct ceph_pagelist *pagelist = arg;
2072 char *path;
2073 int pathlen, err;
2074 u64 pathbase;
2075 struct dentry *dentry;
2076
2077 ci = cap->ci;
2078
2079 dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
2080 inode, ceph_vinop(inode), cap, cap->cap_id,
2081 ceph_cap_string(cap->issued));
2082 err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
2083 if (err)
2084 return err;
2085
2086 dentry = d_find_alias(inode);
2087 if (dentry) {
2088 path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
2089 if (IS_ERR(path)) {
2090 err = PTR_ERR(path);
2091 BUG_ON(err);
2092 }
2093 } else {
2094 path = NULL;
2095 pathlen = 0;
2096 }
2097 err = ceph_pagelist_encode_string(pagelist, path, pathlen);
2098 if (err)
2099 goto out;
2100
2101 spin_lock(&inode->i_lock);
2102 cap->seq = 0; /* reset cap seq */
2103 cap->issue_seq = 0; /* and issue_seq */
2104 rec.cap_id = cpu_to_le64(cap->cap_id);
2105 rec.pathbase = cpu_to_le64(pathbase);
2106 rec.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2107 rec.issued = cpu_to_le32(cap->issued);
2108 rec.size = cpu_to_le64(inode->i_size);
2109 ceph_encode_timespec(&rec.mtime, &inode->i_mtime);
2110 ceph_encode_timespec(&rec.atime, &inode->i_atime);
2111 rec.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2112 spin_unlock(&inode->i_lock);
2113
2114 err = ceph_pagelist_append(pagelist, &rec, sizeof(rec));
2115
2116out:
2117 kfree(path);
2118 dput(dentry);
2119 return err;
2120}
2121
2122
2123/*
2124 * If an MDS fails and recovers, clients need to reconnect in order to
2125 * reestablish shared state. This includes all caps issued through
2126 * this session _and_ the snap_realm hierarchy. Because it's not
2127 * clear which snap realms the mds cares about, we send everything we
2128 * know about; that ensures we'll then get any new info the
2129 * recovering MDS might have.
2130 *
2131 * This is a relatively heavyweight operation, but it's rare.
2132 *
2133 * called with mdsc->mutex held.
2134 */
2135static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
2136{
2137 struct ceph_mds_session *session = NULL;
2138 struct ceph_msg *reply;
2139 struct rb_node *p;
2140 int err = -ENOMEM;
2141 struct ceph_pagelist *pagelist;
2142
2143 pr_info("reconnect to recovering mds%d\n", mds);
2144
2145 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
2146 if (!pagelist)
2147 goto fail_nopagelist;
2148 ceph_pagelist_init(pagelist);
2149
2150 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL);
2151 if (IS_ERR(reply)) {
2152 err = PTR_ERR(reply);
2153 goto fail_nomsg;
2154 }
2155
2156 /* find session */
2157 session = __ceph_lookup_mds_session(mdsc, mds);
2158 mutex_unlock(&mdsc->mutex); /* drop lock for duration */
2159
2160 if (session) {
2161 mutex_lock(&session->s_mutex);
2162
2163 session->s_state = CEPH_MDS_SESSION_RECONNECTING;
2164 session->s_seq = 0;
2165
2166 ceph_con_open(&session->s_con,
2167 ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
2168
2169 /* replay unsafe requests */
2170 replay_unsafe_requests(mdsc, session);
2171 } else {
2172 dout("no session for mds%d, will send short reconnect\n",
2173 mds);
2174 }
2175
2176 down_read(&mdsc->snap_rwsem);
2177
2178 if (!session)
2179 goto send;
2180 dout("session %p state %s\n", session,
2181 session_state_name(session->s_state));
2182
2183 /* traverse this session's caps */
2184 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
2185 if (err)
2186 goto fail;
2187 err = iterate_session_caps(session, encode_caps_cb, pagelist);
2188 if (err < 0)
2189 goto fail;
2190
2191 /*
2192 * snaprealms. we provide mds with the ino, seq (version), and
2193 * parent for all of our realms. If the mds has any newer info,
2194 * it will tell us.
2195 */
2196 for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
2197 struct ceph_snap_realm *realm =
2198 rb_entry(p, struct ceph_snap_realm, node);
2199 struct ceph_mds_snaprealm_reconnect sr_rec;
2200
2201 dout(" adding snap realm %llx seq %lld parent %llx\n",
2202 realm->ino, realm->seq, realm->parent_ino);
2203 sr_rec.ino = cpu_to_le64(realm->ino);
2204 sr_rec.seq = cpu_to_le64(realm->seq);
2205 sr_rec.parent = cpu_to_le64(realm->parent_ino);
2206 err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
2207 if (err)
2208 goto fail;
2209 }
2210
2211send:
2212 reply->pagelist = pagelist;
2213 reply->hdr.data_len = cpu_to_le32(pagelist->length);
2214 reply->nr_pages = calc_pages_for(0, pagelist->length);
2215 ceph_con_send(&session->s_con, reply);
2216
2217 session->s_state = CEPH_MDS_SESSION_OPEN;
2218 mutex_unlock(&session->s_mutex);
2219
2220 mutex_lock(&mdsc->mutex);
2221 __wake_requests(mdsc, &session->s_waiting);
2222 mutex_unlock(&mdsc->mutex);
2223
2224 ceph_put_mds_session(session);
2225
2226 up_read(&mdsc->snap_rwsem);
2227 mutex_lock(&mdsc->mutex);
2228 return;
2229
2230fail:
2231 ceph_msg_put(reply);
2232 up_read(&mdsc->snap_rwsem);
2233 mutex_unlock(&session->s_mutex);
2234 ceph_put_mds_session(session);
2235fail_nomsg:
2236 ceph_pagelist_release(pagelist);
2237 kfree(pagelist);
2238fail_nopagelist:
2239 pr_err("error %d preparing reconnect for mds%d\n", err, mds);
2240 mutex_lock(&mdsc->mutex);
2241 return;
2242}
2243
2244
2245/*
2246 * compare old and new mdsmaps, kicking requests
2247 * and closing out old connections as necessary
2248 *
2249 * called under mdsc->mutex.
2250 */
2251static void check_new_map(struct ceph_mds_client *mdsc,
2252 struct ceph_mdsmap *newmap,
2253 struct ceph_mdsmap *oldmap)
2254{
2255 int i;
2256 int oldstate, newstate;
2257 struct ceph_mds_session *s;
2258
2259 dout("check_new_map new %u old %u\n",
2260 newmap->m_epoch, oldmap->m_epoch);
2261
2262 for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) {
2263 if (mdsc->sessions[i] == NULL)
2264 continue;
2265 s = mdsc->sessions[i];
2266 oldstate = ceph_mdsmap_get_state(oldmap, i);
2267 newstate = ceph_mdsmap_get_state(newmap, i);
2268
2269 dout("check_new_map mds%d state %s -> %s (session %s)\n",
2270 i, ceph_mds_state_name(oldstate),
2271 ceph_mds_state_name(newstate),
2272 session_state_name(s->s_state));
2273
2274 if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
2275 ceph_mdsmap_get_addr(newmap, i),
2276 sizeof(struct ceph_entity_addr))) {
2277 if (s->s_state == CEPH_MDS_SESSION_OPENING) {
2278 /* the session never opened, just close it
2279 * out now */
2280 __wake_requests(mdsc, &s->s_waiting);
2281 __unregister_session(mdsc, s);
2282 } else {
2283 /* just close it */
2284 mutex_unlock(&mdsc->mutex);
2285 mutex_lock(&s->s_mutex);
2286 mutex_lock(&mdsc->mutex);
2287 ceph_con_close(&s->s_con);
2288 mutex_unlock(&s->s_mutex);
2289 s->s_state = CEPH_MDS_SESSION_RESTARTING;
2290 }
2291
2292 /* kick any requests waiting on the recovering mds */
2293 kick_requests(mdsc, i, 1);
2294 } else if (oldstate == newstate) {
2295 continue; /* nothing new with this mds */
2296 }
2297
2298 /*
2299 * send reconnect?
2300 */
2301 if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
2302 newstate >= CEPH_MDS_STATE_RECONNECT)
2303 send_mds_reconnect(mdsc, i);
2304
2305 /*
2306 * kick requests on any mds that has gone active.
2307 *
2308 * kick requests on cur or forwarder: we may have sent
2309 * the request to mds1, mds1 told us it forwarded it
2310 * to mds2, but then we learn mds1 failed and can't be
2311 * sure it successfully forwarded our request before
2312 * it died.
2313 */
2314 if (oldstate < CEPH_MDS_STATE_ACTIVE &&
2315 newstate >= CEPH_MDS_STATE_ACTIVE) {
2316 pr_info("mds%d reconnect completed\n", s->s_mds);
2317 kick_requests(mdsc, i, 1);
2318 ceph_kick_flushing_caps(mdsc, s);
2319 wake_up_session_caps(s, 1);
2320 }
2321 }
2322}
2323
2324
2325
2326/*
2327 * leases
2328 */
2329
2330/*
2331 * caller must hold session s_mutex, dentry->d_lock
2332 */
2333void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
2334{
2335 struct ceph_dentry_info *di = ceph_dentry(dentry);
2336
2337 ceph_put_mds_session(di->lease_session);
2338 di->lease_session = NULL;
2339}
2340
2341static void handle_lease(struct ceph_mds_client *mdsc,
2342 struct ceph_mds_session *session,
2343 struct ceph_msg *msg)
2344{
2345 struct super_block *sb = mdsc->client->sb;
2346 struct inode *inode;
2347 struct ceph_inode_info *ci;
2348 struct dentry *parent, *dentry;
2349 struct ceph_dentry_info *di;
2350 int mds = session->s_mds;
2351 struct ceph_mds_lease *h = msg->front.iov_base;
2352 struct ceph_vino vino;
2353 int mask;
2354 struct qstr dname;
2355 int release = 0;
2356
2357 dout("handle_lease from mds%d\n", mds);
2358
2359 /* decode */
2360 if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
2361 goto bad;
2362 vino.ino = le64_to_cpu(h->ino);
2363 vino.snap = CEPH_NOSNAP;
2364 mask = le16_to_cpu(h->mask);
2365 dname.name = (void *)h + sizeof(*h) + sizeof(u32);
2366 dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
2367 if (dname.len != get_unaligned_le32(h+1))
2368 goto bad;
2369
2370 mutex_lock(&session->s_mutex);
2371 session->s_seq++;
2372
2373 /* lookup inode */
2374 inode = ceph_find_inode(sb, vino);
2375 dout("handle_lease '%s', mask %d, ino %llx %p\n",
2376 ceph_lease_op_name(h->action), mask, vino.ino, inode);
2377 if (inode == NULL) {
2378 dout("handle_lease no inode %llx\n", vino.ino);
2379 goto release;
2380 }
2381 ci = ceph_inode(inode);
2382
2383 /* dentry */
2384 parent = d_find_alias(inode);
2385 if (!parent) {
2386 dout("no parent dentry on inode %p\n", inode);
2387 WARN_ON(1);
2388 goto release; /* hrm... */
2389 }
2390 dname.hash = full_name_hash(dname.name, dname.len);
2391 dentry = d_lookup(parent, &dname);
2392 dput(parent);
2393 if (!dentry)
2394 goto release;
2395
2396 spin_lock(&dentry->d_lock);
2397 di = ceph_dentry(dentry);
2398 switch (h->action) {
2399 case CEPH_MDS_LEASE_REVOKE:
2400 if (di && di->lease_session == session) {
2401 h->seq = cpu_to_le32(di->lease_seq);
2402 __ceph_mdsc_drop_dentry_lease(dentry);
2403 }
2404 release = 1;
2405 break;
2406
2407 case CEPH_MDS_LEASE_RENEW:
2408 if (di && di->lease_session == session &&
2409 di->lease_gen == session->s_cap_gen &&
2410 di->lease_renew_from &&
2411 di->lease_renew_after == 0) {
2412 unsigned long duration =
2413 le32_to_cpu(h->duration_ms) * HZ / 1000;
2414
2415 di->lease_seq = le32_to_cpu(h->seq);
2416 dentry->d_time = di->lease_renew_from + duration;
2417 di->lease_renew_after = di->lease_renew_from +
2418 (duration >> 1);
2419 di->lease_renew_from = 0;
2420 }
2421 break;
2422 }
2423 spin_unlock(&dentry->d_lock);
2424 dput(dentry);
2425
2426 if (!release)
2427 goto out;
2428
2429release:
2430 /* let's just reuse the same message */
2431 h->action = CEPH_MDS_LEASE_REVOKE_ACK;
2432 ceph_msg_get(msg);
2433 ceph_con_send(&session->s_con, msg);
2434
2435out:
2436 iput(inode);
2437 mutex_unlock(&session->s_mutex);
2438 return;
2439
2440bad:
2441 pr_err("corrupt lease message\n");
2442 ceph_msg_dump(msg);
2443}
2444
2445void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2446 struct inode *inode,
2447 struct dentry *dentry, char action,
2448 u32 seq)
2449{
2450 struct ceph_msg *msg;
2451 struct ceph_mds_lease *lease;
2452 int len = sizeof(*lease) + sizeof(u32);
2453 int dnamelen = 0;
2454
2455 dout("lease_send_msg inode %p dentry %p %s to mds%d\n",
2456 inode, dentry, ceph_lease_op_name(action), session->s_mds);
2457 dnamelen = dentry->d_name.len;
2458 len += dnamelen;
2459
2460 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL);
2461 if (IS_ERR(msg))
2462 return;
2463 lease = msg->front.iov_base;
2464 lease->action = action;
2465 lease->mask = cpu_to_le16(CEPH_LOCK_DN);
2466 lease->ino = cpu_to_le64(ceph_vino(inode).ino);
2467 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
2468 lease->seq = cpu_to_le32(seq);
2469 put_unaligned_le32(dnamelen, lease + 1);
2470 memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
2471
2472 /*
2473 * if this is a preemptive lease RELEASE, no need to
2474 * flush request stream, since the actual request will
2475 * soon follow.
2476 */
2477 msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
2478
2479 ceph_con_send(&session->s_con, msg);
2480}
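/*
 * The message built above is laid out as
 * struct ceph_mds_lease (action, mask, ino, first/last snap, seq)
 * __le32 dname length
 * char[] dname bytes
 * which is the layout handle_lease() decodes on receipt.
 */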
2481
2482/*
2483 * Preemptively release a lease we expect to invalidate anyway.
2484 * Both @inode and @dentry are required (see the BUG_ONs below).
2485 */
2486void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
2487 struct dentry *dentry, int mask)
2488{
2489 struct ceph_dentry_info *di;
2490 struct ceph_mds_session *session;
2491 u32 seq;
2492
2493 BUG_ON(inode == NULL);
2494 BUG_ON(dentry == NULL);
2495 BUG_ON(mask != CEPH_LOCK_DN);
2496
2497 /* is dentry lease valid? */
2498 spin_lock(&dentry->d_lock);
2499 di = ceph_dentry(dentry);
2500 if (!di || !di->lease_session ||
2501 di->lease_session->s_mds < 0 ||
2502 di->lease_gen != di->lease_session->s_cap_gen ||
2503 !time_before(jiffies, dentry->d_time)) {
2504 dout("lease_release inode %p dentry %p -- "
2505 "no lease on %d\n",
2506 inode, dentry, mask);
2507 spin_unlock(&dentry->d_lock);
2508 return;
2509 }
2510
2511 /* we do have a lease on this dentry; note mds and seq */
2512 session = ceph_get_mds_session(di->lease_session);
2513 seq = di->lease_seq;
2514 __ceph_mdsc_drop_dentry_lease(dentry);
2515 spin_unlock(&dentry->d_lock);
2516
2517 dout("lease_release inode %p dentry %p mask %d to mds%d\n",
2518 inode, dentry, mask, session->s_mds);
2519 ceph_mdsc_lease_send_msg(session, inode, dentry,
2520 CEPH_MDS_LEASE_RELEASE, seq);
2521 ceph_put_mds_session(session);
2522}
2523
2524/*
2525 * drop all leases (and dentry refs) in preparation for umount
2526 */
2527static void drop_leases(struct ceph_mds_client *mdsc)
2528{
2529 int i;
2530
2531 dout("drop_leases\n");
2532 mutex_lock(&mdsc->mutex);
2533 for (i = 0; i < mdsc->max_sessions; i++) {
2534 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
2535 if (!s)
2536 continue;
2537 mutex_unlock(&mdsc->mutex);
2538 mutex_lock(&s->s_mutex);
2539 mutex_unlock(&s->s_mutex);
2540 ceph_put_mds_session(s);
2541 mutex_lock(&mdsc->mutex);
2542 }
2543 mutex_unlock(&mdsc->mutex);
2544}
2545
2546
2547
2548/*
2549 * delayed work -- periodically trim expired leases, renew caps with mds
2550 */
2551static void schedule_delayed(struct ceph_mds_client *mdsc)
2552{
2553 int delay = 5;
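 /* e.g. with HZ == 250 this is 1250 jiffies (~5 seconds);
 * round_jiffies_relative() aligns the expiry to a whole second so
 * periodic wakeups across the system can batch */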
2554 unsigned hz = round_jiffies_relative(HZ * delay);
2555 schedule_delayed_work(&mdsc->delayed_work, hz);
2556}
2557
2558static void delayed_work(struct work_struct *work)
2559{
2560 int i;
2561 struct ceph_mds_client *mdsc =
2562 container_of(work, struct ceph_mds_client, delayed_work.work);
2563 int renew_interval;
2564 int renew_caps;
2565
2566 dout("mdsc delayed_work\n");
2567 ceph_check_delayed_caps(mdsc);
2568
2569 mutex_lock(&mdsc->mutex);
2570 renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
2571 renew_caps = time_after_eq(jiffies, HZ*renew_interval +
2572 mdsc->last_renew_caps);
2573 if (renew_caps)
2574 mdsc->last_renew_caps = jiffies;
2575
2576 for (i = 0; i < mdsc->max_sessions; i++) {
2577 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
2578 if (s == NULL)
2579 continue;
2580 if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
2581 dout("resending session close request for mds%d\n",
2582 s->s_mds);
2583 request_close_session(mdsc, s);
2584 ceph_put_mds_session(s);
2585 continue;
2586 }
2587 if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
2588 if (s->s_state == CEPH_MDS_SESSION_OPEN) {
2589 s->s_state = CEPH_MDS_SESSION_HUNG;
2590 pr_info("mds%d hung\n", s->s_mds);
2591 }
2592 }
2593 if (s->s_state < CEPH_MDS_SESSION_OPEN) {
2594 /* this mds is failed or recovering, just wait */
2595 ceph_put_mds_session(s);
2596 continue;
2597 }
2598 mutex_unlock(&mdsc->mutex);
2599
2600 mutex_lock(&s->s_mutex);
2601 if (renew_caps)
2602 send_renew_caps(mdsc, s);
2603 else
2604 ceph_con_keepalive(&s->s_con);
2605 add_cap_releases(mdsc, s, -1);
2606 send_cap_releases(mdsc, s);
2607 mutex_unlock(&s->s_mutex);
2608 ceph_put_mds_session(s);
2609
2610 mutex_lock(&mdsc->mutex);
2611 }
2612 mutex_unlock(&mdsc->mutex);
2613
2614 schedule_delayed(mdsc);
2615}
2616
2617
2618int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2619{
2620 mdsc->client = client;
2621 mutex_init(&mdsc->mutex);
2622 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
 if (mdsc->mdsmap == NULL)
 return -ENOMEM;
2623 init_completion(&mdsc->safe_umount_waiters);
2624 init_completion(&mdsc->session_close_waiters);
2625 INIT_LIST_HEAD(&mdsc->waiting_for_map);
2626 mdsc->sessions = NULL;
2627 mdsc->max_sessions = 0;
2628 mdsc->stopping = 0;
2629 init_rwsem(&mdsc->snap_rwsem);
2630 mdsc->snap_realms = RB_ROOT;
2631 INIT_LIST_HEAD(&mdsc->snap_empty);
2632 spin_lock_init(&mdsc->snap_empty_lock);
2633 mdsc->last_tid = 0;
2634 mdsc->request_tree = RB_ROOT;
2635 INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
2636 mdsc->last_renew_caps = jiffies;
2637 INIT_LIST_HEAD(&mdsc->cap_delay_list);
2638 spin_lock_init(&mdsc->cap_delay_lock);
2639 INIT_LIST_HEAD(&mdsc->snap_flush_list);
2640 spin_lock_init(&mdsc->snap_flush_lock);
2641 mdsc->cap_flush_seq = 0;
2642 INIT_LIST_HEAD(&mdsc->cap_dirty);
2643 mdsc->num_cap_flushing = 0;
2644 spin_lock_init(&mdsc->cap_dirty_lock);
2645 init_waitqueue_head(&mdsc->cap_flushing_wq);
2646 spin_lock_init(&mdsc->dentry_lru_lock);
2647 INIT_LIST_HEAD(&mdsc->dentry_lru);
2648 return 0;
2649}
2650
2651/*
2652 * Wait for safe replies on open mds requests. If we time out, drop
2653 * all requests from the tree to avoid dangling dentry refs.
2654 */
2655static void wait_requests(struct ceph_mds_client *mdsc)
2656{
2657 struct ceph_mds_request *req;
2658 struct ceph_client *client = mdsc->client;
2659
2660 mutex_lock(&mdsc->mutex);
2661 if (__get_oldest_req(mdsc)) {
2662 mutex_unlock(&mdsc->mutex);
2663
2664 dout("wait_requests waiting for requests\n");
2665 wait_for_completion_timeout(&mdsc->safe_umount_waiters,
2666 client->mount_args->mount_timeout * HZ);
2667
2668 /* tear down remaining requests */
2669 mutex_lock(&mdsc->mutex);
2670 while ((req = __get_oldest_req(mdsc))) {
2671 dout("wait_requests timed out on tid %llu\n",
2672 req->r_tid);
2673 __unregister_request(mdsc, req);
2674 }
2675 }
2676 mutex_unlock(&mdsc->mutex);
2677 dout("wait_requests done\n");
2678}
2679
2680/*
2681 * called before mount is ro, and before dentries are torn down.
2682 * (hmm, does this still race with new lookups?)
2683 */
2684void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
2685{
2686 dout("pre_umount\n");
2687 mdsc->stopping = 1;
2688
2689 drop_leases(mdsc);
2690 ceph_flush_dirty_caps(mdsc);
2691 wait_requests(mdsc);
2692}
2693
2694/*
2695 * wait for all write mds requests to flush.
2696 */
2697static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
2698{
2699 struct ceph_mds_request *req = NULL, *nextreq;
2700 struct rb_node *n;
2701
2702 mutex_lock(&mdsc->mutex);
2703 dout("wait_unsafe_requests want %lld\n", want_tid);
2704restart:
2705 req = __get_oldest_req(mdsc);
2706 while (req && req->r_tid <= want_tid) {
2707 /* find next request */
2708 n = rb_next(&req->r_node);
2709 if (n)
2710 nextreq = rb_entry(n, struct ceph_mds_request, r_node);
2711 else
2712 nextreq = NULL;
2713 if ((req->r_op & CEPH_MDS_OP_WRITE)) {
2714 /* write op */
2715 ceph_mdsc_get_request(req);
2716 if (nextreq)
2717 ceph_mdsc_get_request(nextreq);
2718 mutex_unlock(&mdsc->mutex);
2719 dout("wait_unsafe_requests wait on %llu (want %llu)\n",
2720 req->r_tid, want_tid);
2721 wait_for_completion(&req->r_safe_completion);
2722 mutex_lock(&mdsc->mutex);
2723 ceph_mdsc_put_request(req);
2724 if (!nextreq)
2725 break; /* next dne before, so we're done! */
2726 if (RB_EMPTY_NODE(&nextreq->r_node)) {
2727 /* next request was removed from tree */
2728 ceph_mdsc_put_request(nextreq);
2729 goto restart;
2730 }
2731 ceph_mdsc_put_request(nextreq); /* won't go away */
2732 }
2733 req = nextreq;
2734 }
2735 mutex_unlock(&mdsc->mutex);
2736 dout("wait_unsafe_requests done\n");
2737}
2738
2739void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
2740{
2741 u64 want_tid, want_flush;
2742
2743 dout("sync\n");
2744 mutex_lock(&mdsc->mutex);
2745 want_tid = mdsc->last_tid;
2746 want_flush = mdsc->cap_flush_seq;
2747 mutex_unlock(&mdsc->mutex);
2748 dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
2749
2750 ceph_flush_dirty_caps(mdsc);
2751
2752 wait_unsafe_requests(mdsc, want_tid);
2753 wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
2754}
2755
2756
2757/*
2758 * called after sb is ro.
2759 */
2760void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
2761{
2762 struct ceph_mds_session *session;
2763 int i;
2764 int n;
2765 struct ceph_client *client = mdsc->client;
2766 unsigned long started, timeout = client->mount_args->mount_timeout * HZ;
2767
2768 dout("close_sessions\n");
2769
2770 mutex_lock(&mdsc->mutex);
2771
2772 /* close sessions */
2773 started = jiffies;
2774 while (time_before(jiffies, started + timeout)) {
2775 dout("closing sessions\n");
2776 n = 0;
2777 for (i = 0; i < mdsc->max_sessions; i++) {
2778 session = __ceph_lookup_mds_session(mdsc, i);
2779 if (!session)
2780 continue;
2781 mutex_unlock(&mdsc->mutex);
2782 mutex_lock(&session->s_mutex);
2783 __close_session(mdsc, session);
2784 mutex_unlock(&session->s_mutex);
2785 ceph_put_mds_session(session);
2786 mutex_lock(&mdsc->mutex);
2787 n++;
2788 }
2789 if (n == 0)
2790 break;
2791
2792 if (client->mount_state == CEPH_MOUNT_SHUTDOWN)
2793 break;
2794
2795 dout("waiting for sessions to close\n");
2796 mutex_unlock(&mdsc->mutex);
2797 wait_for_completion_timeout(&mdsc->session_close_waiters,
2798 timeout);
2799 mutex_lock(&mdsc->mutex);
2800 }
2801
2802 /* tear down remaining sessions */
2803 for (i = 0; i < mdsc->max_sessions; i++) {
2804 if (mdsc->sessions[i]) {
2805 session = get_session(mdsc->sessions[i]);
2806 __unregister_session(mdsc, session);
2807 mutex_unlock(&mdsc->mutex);
2808 mutex_lock(&session->s_mutex);
2809 remove_session_caps(session);
2810 mutex_unlock(&session->s_mutex);
2811 ceph_put_mds_session(session);
2812 mutex_lock(&mdsc->mutex);
2813 }
2814 }
2815
2816 WARN_ON(!list_empty(&mdsc->cap_delay_list));
2817
2818 mutex_unlock(&mdsc->mutex);
2819
2820 ceph_cleanup_empty_realms(mdsc);
2821
2822 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
2823
2824 dout("stopped\n");
2825}
2826
2827void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
2828{
2829 dout("stop\n");
2830 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
2831 if (mdsc->mdsmap)
2832 ceph_mdsmap_destroy(mdsc->mdsmap);
2833 kfree(mdsc->sessions);
2834}
2835
2836
2837/*
2838 * handle mds map update.
2839 */
2840void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
2841{
2842 u32 epoch;
2843 u32 maplen;
2844 void *p = msg->front.iov_base;
2845 void *end = p + msg->front.iov_len;
2846 struct ceph_mdsmap *newmap, *oldmap;
2847 struct ceph_fsid fsid;
2848 int err = -EINVAL;
2849
2850 ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
2851 ceph_decode_copy(&p, &fsid, sizeof(fsid));
2852 if (ceph_check_fsid(mdsc->client, &fsid) < 0)
2853 return;
2854 epoch = ceph_decode_32(&p);
2855 maplen = ceph_decode_32(&p);
2856 dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
2857
2858 /* do we need it? */
2859 ceph_monc_got_mdsmap(&mdsc->client->monc, epoch);
2860 mutex_lock(&mdsc->mutex);
2861 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
2862 dout("handle_map epoch %u <= our %u\n",
2863 epoch, mdsc->mdsmap->m_epoch);
2864 mutex_unlock(&mdsc->mutex);
2865 return;
2866 }
2867
2868 newmap = ceph_mdsmap_decode(&p, end);
2869 if (IS_ERR(newmap)) {
2870 err = PTR_ERR(newmap);
2871 goto bad_unlock;
2872 }
2873
2874 /* swap into place */
2875 if (mdsc->mdsmap) {
2876 oldmap = mdsc->mdsmap;
2877 mdsc->mdsmap = newmap;
2878 check_new_map(mdsc, newmap, oldmap);
2879 ceph_mdsmap_destroy(oldmap);
2880 } else {
2881 mdsc->mdsmap = newmap; /* first mds map */
2882 }
2883 mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
2884
2885 __wake_requests(mdsc, &mdsc->waiting_for_map);
2886
2887 mutex_unlock(&mdsc->mutex);
2888 schedule_delayed(mdsc);
2889 return;
2890
2891bad_unlock:
2892 mutex_unlock(&mdsc->mutex);
2893bad:
2894 pr_err("error decoding mdsmap %d\n", err);
2895 return;
2896}
2897
2898static struct ceph_connection *con_get(struct ceph_connection *con)
2899{
2900 struct ceph_mds_session *s = con->private;
2901
2902 if (get_session(s)) {
2903 dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref));
2904 return con;
2905 }
2906 dout("mdsc con_get %p FAIL\n", s);
2907 return NULL;
2908}
2909
2910static void con_put(struct ceph_connection *con)
2911{
2912 struct ceph_mds_session *s = con->private;
2913
2914 ceph_put_mds_session(s);
2915 dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref));
2916}
2917
2918/*
2919 * if the client is unresponsive for long enough, the mds will kill
2920 * the session entirely.
2921 */
2922static void peer_reset(struct ceph_connection *con)
2923{
2924 struct ceph_mds_session *s = con->private;
2925
2926 pr_err("mds%d gave us the boot. IMPLEMENT RECONNECT.\n",
2927 s->s_mds);
2928}
2929
2930static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
2931{
2932 struct ceph_mds_session *s = con->private;
2933 struct ceph_mds_client *mdsc = s->s_mdsc;
2934 int type = le16_to_cpu(msg->hdr.type);
2935
2936 mutex_lock(&mdsc->mutex);
2937 if (__verify_registered_session(mdsc, s) < 0) {
2938 mutex_unlock(&mdsc->mutex);
2939 goto out;
2940 }
2941 mutex_unlock(&mdsc->mutex);
2942
2943 switch (type) {
2944 case CEPH_MSG_MDS_MAP:
2945 ceph_mdsc_handle_map(mdsc, msg);
2946 break;
2947 case CEPH_MSG_CLIENT_SESSION:
2948 handle_session(s, msg);
2949 break;
2950 case CEPH_MSG_CLIENT_REPLY:
2951 handle_reply(s, msg);
2952 break;
2953 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
2954 handle_forward(mdsc, s, msg);
2955 break;
2956 case CEPH_MSG_CLIENT_CAPS:
2957 ceph_handle_caps(s, msg);
2958 break;
2959 case CEPH_MSG_CLIENT_SNAP:
2960 ceph_handle_snap(mdsc, s, msg);
2961 break;
2962 case CEPH_MSG_CLIENT_LEASE:
2963 handle_lease(mdsc, s, msg);
2964 break;
2965
2966 default:
2967 pr_err("received unknown message type %d %s\n", type,
2968 ceph_msg_type_name(type));
2969 }
2970out:
2971 ceph_msg_put(msg);
2972}
2973
2974/*
2975 * authentication
2976 */
2977static int get_authorizer(struct ceph_connection *con,
2978 void **buf, int *len, int *proto,
2979 void **reply_buf, int *reply_len, int force_new)
2980{
2981 struct ceph_mds_session *s = con->private;
2982 struct ceph_mds_client *mdsc = s->s_mdsc;
2983 struct ceph_auth_client *ac = mdsc->client->monc.auth;
2984 int ret = 0;
2985
2986 if (force_new && s->s_authorizer) {
2987 ac->ops->destroy_authorizer(ac, s->s_authorizer);
2988 s->s_authorizer = NULL;
2989 }
2990 if (s->s_authorizer == NULL) {
2991 if (ac->ops->create_authorizer) {
2992 ret = ac->ops->create_authorizer(
2993 ac, CEPH_ENTITY_TYPE_MDS,
2994 &s->s_authorizer,
2995 &s->s_authorizer_buf,
2996 &s->s_authorizer_buf_len,
2997 &s->s_authorizer_reply_buf,
2998 &s->s_authorizer_reply_buf_len);
2999 if (ret)
3000 return ret;
3001 }
3002 }
3003
3004 *proto = ac->protocol;
3005 *buf = s->s_authorizer_buf;
3006 *len = s->s_authorizer_buf_len;
3007 *reply_buf = s->s_authorizer_reply_buf;
3008 *reply_len = s->s_authorizer_reply_buf_len;
3009 return 0;
3010}
3011
3012
3013static int verify_authorizer_reply(struct ceph_connection *con, int len)
3014{
3015 struct ceph_mds_session *s = con->private;
3016 struct ceph_mds_client *mdsc = s->s_mdsc;
3017 struct ceph_auth_client *ac = mdsc->client->monc.auth;
3018
3019 return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
3020}
3021
3022static int invalidate_authorizer(struct ceph_connection *con)
3023{
3024 struct ceph_mds_session *s = con->private;
3025 struct ceph_mds_client *mdsc = s->s_mdsc;
3026 struct ceph_auth_client *ac = mdsc->client->monc.auth;
3027
3028 if (ac->ops->invalidate_authorizer)
3029 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
3030
3031 return ceph_monc_validate_auth(&mdsc->client->monc);
3032}
3033
3034static const struct ceph_connection_operations mds_con_ops = {
3035 .get = con_get,
3036 .put = con_put,
3037 .dispatch = dispatch,
3038 .get_authorizer = get_authorizer,
3039 .verify_authorizer_reply = verify_authorizer_reply,
3040 .invalidate_authorizer = invalidate_authorizer,
3041 .peer_reset = peer_reset,
3042};
3043
3044
3045
3046
3047/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
new file mode 100644
index 000000000000..961cc6f65878
--- /dev/null
+++ b/fs/ceph/mds_client.h
@@ -0,0 +1,335 @@
1#ifndef _FS_CEPH_MDS_CLIENT_H
2#define _FS_CEPH_MDS_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/list.h>
7#include <linux/mutex.h>
8#include <linux/rbtree.h>
9#include <linux/spinlock.h>
10
11#include "types.h"
12#include "messenger.h"
13#include "mdsmap.h"
14
15/*
16 * Some lock dependencies:
17 *
18 * session->s_mutex
19 * mdsc->mutex
20 *
21 * mdsc->snap_rwsem
22 *
23 * inode->i_lock
24 * mdsc->snap_flush_lock
25 * mdsc->cap_delay_lock
26 *
27 */
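/*
 * i.e. when both are needed, session->s_mutex is taken before
 * mdsc->mutex, never the reverse:
 *
 * mutex_lock(&session->s_mutex);
 * mutex_lock(&mdsc->mutex);
 * ...
 * mutex_unlock(&mdsc->mutex);
 * mutex_unlock(&session->s_mutex);
 *
 * which is why mds_client.c drops mdsc->mutex before taking a
 * session's s_mutex (see check_new_map() and delayed_work()).
 */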
28
29struct ceph_client;
30struct ceph_cap;
31
32/*
33 * parsed info about a single inode. pointers are into the encoded
34 * on-wire structures within the mds reply message payload.
35 */
36struct ceph_mds_reply_info_in {
37 struct ceph_mds_reply_inode *in;
38 u32 symlink_len;
39 char *symlink;
40 u32 xattr_len;
41 char *xattr_data;
42};
43
44/*
45 * parsed info about an mds reply, including information about the
46 * target inode and/or its parent directory and dentry, and directory
47 * contents (for readdir results).
48 */
49struct ceph_mds_reply_info_parsed {
50 struct ceph_mds_reply_head *head;
51
52 struct ceph_mds_reply_info_in diri, targeti;
53 struct ceph_mds_reply_dirfrag *dirfrag;
54 char *dname;
55 u32 dname_len;
56 struct ceph_mds_reply_lease *dlease;
57
58 struct ceph_mds_reply_dirfrag *dir_dir;
59 int dir_nr;
60 char **dir_dname;
61 u32 *dir_dname_len;
62 struct ceph_mds_reply_lease **dir_dlease;
63 struct ceph_mds_reply_info_in *dir_in;
64 u8 dir_complete, dir_end;
65
66 /* encoded blob describing snapshot contexts for certain
67 operations (e.g., open) */
68 void *snapblob;
69 int snapblob_len;
70};
71
72
73/*
74 * cap releases are batched and sent to the MDS en masse.
75 */
76#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \
77 sizeof(struct ceph_mds_cap_release)) / \
78 sizeof(struct ceph_mds_cap_item))
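/*
 * Worked example (sizes are illustrative, not normative): with a
 * 4096-byte PAGE_CACHE_SIZE, a 4-byte ceph_mds_cap_release header and
 * 24-byte ceph_mds_cap_item entries, this comes to
 * (4096 - 4) / 24 = 170 cap releases per message page.
 */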
79
80
81/*
82 * state associated with each MDS<->client session
83 */
84enum {
85 CEPH_MDS_SESSION_NEW = 1,
86 CEPH_MDS_SESSION_OPENING = 2,
87 CEPH_MDS_SESSION_OPEN = 3,
88 CEPH_MDS_SESSION_HUNG = 4,
89 CEPH_MDS_SESSION_CLOSING = 5,
90 CEPH_MDS_SESSION_RESTARTING = 6,
91 CEPH_MDS_SESSION_RECONNECTING = 7,
92};
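/*
 * Transitions, as driven by mds_client.c: a session sits in OPENING
 * until the OPEN reply arrives; OPEN flips to HUNG when s_ttl expires
 * (delayed_work) and back again on the next message; an mds address
 * change moves OPEN/HUNG to RESTARTING (check_new_map), after which
 * send_mds_reconnect() goes RECONNECTING -> OPEN; CLOSING is held
 * while a session close request is in flight.
 */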
93
94struct ceph_mds_session {
95 struct ceph_mds_client *s_mdsc;
96 int s_mds;
97 int s_state;
98 unsigned long s_ttl; /* time until mds kills us */
99 u64 s_seq; /* incoming msg seq # */
100 struct mutex s_mutex; /* serialize session messages */
101
102 struct ceph_connection s_con;
103
104 struct ceph_authorizer *s_authorizer;
105 void *s_authorizer_buf, *s_authorizer_reply_buf;
106 size_t s_authorizer_buf_len, s_authorizer_reply_buf_len;
107
108 /* protected by s_cap_lock */
109 spinlock_t s_cap_lock;
110 u32 s_cap_gen; /* inc each time we get mds stale msg */
111 unsigned long s_cap_ttl; /* when session caps expire */
112 struct list_head s_caps; /* all caps issued by this session */
113 int s_nr_caps, s_trim_caps;
114 int s_num_cap_releases;
115 struct list_head s_cap_releases; /* waiting cap_release messages */
116 struct list_head s_cap_releases_done; /* ready to send */
117 struct ceph_cap *s_cap_iterator;
118
119 /* protected by mutex */
120 struct list_head s_cap_flushing; /* inodes w/ flushing caps */
121 struct list_head s_cap_snaps_flushing;
122 unsigned long s_renew_requested; /* last time we sent a renew req */
123 u64 s_renew_seq;
124
125 atomic_t s_ref;
126 struct list_head s_waiting; /* waiting requests */
127 struct list_head s_unsafe; /* unsafe requests */
128};
129
130/*
131 * modes of choosing which MDS to send a request to
132 */
133enum {
134 USE_ANY_MDS,
135 USE_RANDOM_MDS,
136 USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */
137};
138
139struct ceph_mds_request;
140struct ceph_mds_client;
141
142/*
143 * request completion callback
144 */
145typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
146 struct ceph_mds_request *req);
147
148/*
149 * an in-flight mds request
150 */
151struct ceph_mds_request {
152 u64 r_tid; /* transaction id */
153 struct rb_node r_node;
154
155 int r_op; /* mds op code */
156 int r_mds;
157
158 /* operation on what? */
159 struct inode *r_inode; /* arg1 */
160 struct dentry *r_dentry; /* arg1 */
161 struct dentry *r_old_dentry; /* arg2: rename from or link from */
162 char *r_path1, *r_path2;
163 struct ceph_vino r_ino1, r_ino2;
164
165 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
166 struct inode *r_target_inode; /* resulting inode */
167
168 union ceph_mds_request_args r_args;
169 int r_fmode; /* file mode, if expecting cap */
170
171 /* for choosing which mds to send this request to */
172 int r_direct_mode;
173 u32 r_direct_hash; /* choose dir frag based on this dentry hash */
174 bool r_direct_is_hash; /* true if r_direct_hash is valid */
175
176 /* data payload is used for xattr ops */
177 struct page **r_pages;
178 int r_num_pages;
179 int r_data_len;
180
181 /* what caps shall we drop? */
182 int r_inode_drop, r_inode_unless;
183 int r_dentry_drop, r_dentry_unless;
184 int r_old_dentry_drop, r_old_dentry_unless;
185 struct inode *r_old_inode;
186 int r_old_inode_drop, r_old_inode_unless;
187
188 struct ceph_msg *r_request; /* original request */
189 struct ceph_msg *r_reply;
190 struct ceph_mds_reply_info_parsed r_reply_info;
191 int r_err;
192 bool r_aborted;
193
194 unsigned long r_timeout; /* optional. jiffies */
195 unsigned long r_started; /* start time to measure timeout against */
196 unsigned long r_request_started; /* start time for mds request only,
197 used to measure lease durations */
198
199 /* link unsafe requests to parent directory, for fsync */
200 struct inode *r_unsafe_dir;
201 struct list_head r_unsafe_dir_item;
202
203 struct ceph_mds_session *r_session;
204
205 int r_attempts; /* resend attempts */
206 int r_num_fwd; /* number of forward attempts */
207 int r_num_stale;
208 int r_resend_mds; /* mds to resend to next, if any */
209
210 struct kref r_kref;
211 struct list_head r_wait;
212 struct completion r_completion;
213 struct completion r_safe_completion;
214 ceph_mds_request_callback_t r_callback;
215 struct list_head r_unsafe_item; /* per-session unsafe list item */
216 bool r_got_unsafe, r_got_safe;
217
218 bool r_did_prepopulate;
219 u32 r_readdir_offset;
220
221 struct ceph_cap_reservation r_caps_reservation;
222 int r_num_caps;
223};
224
225/*
226 * mds client state
227 */
228struct ceph_mds_client {
229 struct ceph_client *client;
230 struct mutex mutex; /* all nested structures */
231
232 struct ceph_mdsmap *mdsmap;
233 struct completion safe_umount_waiters, session_close_waiters;
234 struct list_head waiting_for_map;
235
236 struct ceph_mds_session **sessions; /* NULL if no session for mds i */
237 int max_sessions; /* len of sessions array */
238 int stopping; /* true if shutting down */
239
240 /*
241 * snap_rwsem will cover cap linkage into snaprealms, and
242 * realm snap contexts. (later, we can do per-realm snap
243 * context locks.) The empty list contains realms with no
244 * references (implying they contain no inodes with caps) that
245 * should be destroyed.
246 */
247 struct rw_semaphore snap_rwsem;
248 struct rb_root snap_realms;
249 struct list_head snap_empty;
250 spinlock_t snap_empty_lock; /* protect snap_empty */
251
252 u64 last_tid; /* most recent mds request */
253 struct rb_root request_tree; /* pending mds requests */
254 struct delayed_work delayed_work; /* delayed work */
255 unsigned long last_renew_caps; /* last time we renewed our caps */
256 struct list_head cap_delay_list; /* caps with delayed release */
257 spinlock_t cap_delay_lock; /* protects cap_delay_list */
258 struct list_head snap_flush_list; /* cap_snaps ready to flush */
259 spinlock_t snap_flush_lock;
260
261 u64 cap_flush_seq;
262 struct list_head cap_dirty; /* inodes with dirty caps */
263 int num_cap_flushing; /* # caps we are flushing */
264 spinlock_t cap_dirty_lock; /* protects above items */
265 wait_queue_head_t cap_flushing_wq;
266
267#ifdef CONFIG_DEBUG_FS
268 struct dentry *debugfs_file;
269#endif
270
271 spinlock_t dentry_lru_lock;
272 struct list_head dentry_lru;
273 int num_dentry;
274};
275
276extern const char *ceph_mds_op_name(int op);
277
278extern struct ceph_mds_session *
279__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
280
281static inline struct ceph_mds_session *
282ceph_get_mds_session(struct ceph_mds_session *s)
283{
284 atomic_inc(&s->s_ref);
285 return s;
286}
287
288extern void ceph_put_mds_session(struct ceph_mds_session *s);
289
290extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
291 struct ceph_msg *msg, int mds);
292
293extern int ceph_mdsc_init(struct ceph_mds_client *mdsc,
294 struct ceph_client *client);
295extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
296extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);
297
298extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
299
300extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
301 struct inode *inode,
302 struct dentry *dn, int mask);
303
304extern struct ceph_mds_request *
305ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
306extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
307 struct ceph_mds_request *req);
308extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
309 struct inode *dir,
310 struct ceph_mds_request *req);
311static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
312{
313 kref_get(&req->r_kref);
314}
315extern void ceph_mdsc_release_request(struct kref *kref);
316static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
317{
318 kref_put(&req->r_kref, ceph_mdsc_release_request);
319}
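/*
 * Sketch of the refcount pairing: code that keeps a request pointer
 * across a lock drop takes a reference first, e.g.
 *
 * ceph_mdsc_get_request(req);
 * mutex_unlock(&mdsc->mutex);
 * ...
 * mutex_lock(&mdsc->mutex);
 * ceph_mdsc_put_request(req);
 *
 * as wait_unsafe_requests() in mds_client.c does while waiting on
 * r_safe_completion.
 */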
320
321extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
322
323extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
324 int stop_on_nosnap);
325
326extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
327extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
328 struct inode *inode,
329 struct dentry *dentry, char action,
330 u32 seq);
331
332extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
333 struct ceph_msg *msg);
334
335#endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
new file mode 100644
index 000000000000..c4c498e6dfef
--- /dev/null
+++ b/fs/ceph/mdsmap.c
@@ -0,0 +1,174 @@
1#include "ceph_debug.h"
2
3#include <linux/bug.h>
4#include <linux/err.h>
5#include <linux/random.h>
6#include <linux/slab.h>
7#include <linux/types.h>
8
9#include "mdsmap.h"
10#include "messenger.h"
11#include "decode.h"
12
13#include "super.h"
14
15
16/*
17 * choose a random mds that is "up" (i.e. has a state > 0), or -1.
18 */
19int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
20{
21 int n = 0;
22 int i;
23 u8 r; /* unsigned, so r % n below can't go negative */
24
25 /* count */
26 for (i = 0; i < m->m_max_mds; i++)
27 if (m->m_info[i].state > 0)
28 n++;
29 if (n == 0)
30 return -1;
31
32 /* pick */
33 get_random_bytes(&r, 1);
34 n = r % n;
35 i = 0;
36 for (i = 0; n > 0; i++, n--)
37 while (m->m_info[i].state <= 0)
38 i++;
39
40 return i;
41}
42
43/*
44 * Decode an MDS map
45 *
46 * Ignore any fields we don't care about (there are quite a few of
47 * them).
48 */
49struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
50{
51 struct ceph_mdsmap *m;
52 const void *start = *p;
53 int i, j, n;
54 int err = -EINVAL;
55 u16 version;
56
57 m = kzalloc(sizeof(*m), GFP_NOFS);
58 if (m == NULL)
59 return ERR_PTR(-ENOMEM);
60
61 ceph_decode_16_safe(p, end, version, bad);
62
63 ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
64 m->m_epoch = ceph_decode_32(p);
65 m->m_client_epoch = ceph_decode_32(p);
66 m->m_last_failure = ceph_decode_32(p);
67 m->m_root = ceph_decode_32(p);
68 m->m_session_timeout = ceph_decode_32(p);
69 m->m_session_autoclose = ceph_decode_32(p);
70 m->m_max_file_size = ceph_decode_64(p);
71 m->m_max_mds = ceph_decode_32(p);
72
73 m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS);
74 if (m->m_info == NULL)
75 goto badmem;
76
77 /* pick out active nodes from mds_info (state > 0) */
78 n = ceph_decode_32(p);
79 for (i = 0; i < n; i++) {
80 u64 global_id;
81 u32 namelen;
82 s32 mds, inc, state;
83 u64 state_seq;
84 u8 infoversion;
85 struct ceph_entity_addr addr;
86 u32 num_export_targets;
87 void *pexport_targets = NULL;
88
89 ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
90 global_id = ceph_decode_64(p);
91 infoversion = ceph_decode_8(p);
92 *p += sizeof(u64);
93 namelen = ceph_decode_32(p); /* skip mds name */
94 *p += namelen;
95
96 ceph_decode_need(p, end,
97 4*sizeof(u32) + sizeof(u64) +
98 sizeof(addr) + sizeof(struct ceph_timespec),
99 bad);
100 mds = ceph_decode_32(p);
101 inc = ceph_decode_32(p);
102 state = ceph_decode_32(p);
103 state_seq = ceph_decode_64(p);
104 ceph_decode_copy(p, &addr, sizeof(addr));
105 ceph_decode_addr(&addr);
106 *p += sizeof(struct ceph_timespec);
107 *p += sizeof(u32);
108 ceph_decode_32_safe(p, end, namelen, bad);
109 *p += namelen;
110 if (infoversion >= 2) {
111 ceph_decode_32_safe(p, end, num_export_targets, bad);
 ceph_decode_need(p, end, num_export_targets * sizeof(u32), bad);
112 pexport_targets = *p;
113 *p += num_export_targets * sizeof(u32);
114 } else {
115 num_export_targets = 0;
116 }
117
118 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
119 i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr),
120 ceph_mds_state_name(state));
121 if (mds >= 0 && mds < m->m_max_mds && state > 0) {
122 m->m_info[mds].global_id = global_id;
123 m->m_info[mds].state = state;
124 m->m_info[mds].addr = addr;
125 m->m_info[mds].num_export_targets = num_export_targets;
126 if (num_export_targets) {
127 m->m_info[mds].export_targets =
128 kcalloc(num_export_targets, sizeof(u32),
129 GFP_NOFS);
130 for (j = 0; j < num_export_targets; j++)
131 m->m_info[mds].export_targets[j] =
132 ceph_decode_32(&pexport_targets);
133 } else {
134 m->m_info[mds].export_targets = NULL;
135 }
136 }
137 }
138
139 /* pg_pools */
140 ceph_decode_32_safe(p, end, n, bad);
141 m->m_num_data_pg_pools = n;
142 m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS);
143 if (!m->m_data_pg_pools)
144 goto badmem;
145 ceph_decode_need(p, end, sizeof(u32)*(n+1), bad);
146 for (i = 0; i < n; i++)
147 m->m_data_pg_pools[i] = ceph_decode_32(p);
148 m->m_cas_pg_pool = ceph_decode_32(p);
149
150 /* ok, we don't care about the rest. */
151 dout("mdsmap_decode success epoch %u\n", m->m_epoch);
152 return m;
153
154badmem:
155 err = -ENOMEM;
156bad:
157 pr_err("corrupt mdsmap\n");
158 print_hex_dump(KERN_DEBUG, "mdsmap: ",
159 DUMP_PREFIX_OFFSET, 16, 1,
160 start, end - start, true);
161 ceph_mdsmap_destroy(m);
162 return ERR_PTR(err);
163}
164
165void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
166{
167 int i;
168
169 for (i = 0; i < m->m_max_mds; i++)
170 kfree(m->m_info[i].export_targets);
171 kfree(m->m_info);
172 kfree(m->m_data_pg_pools);
173 kfree(m);
174}
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
new file mode 100644
index 000000000000..eacc131aa5cb
--- /dev/null
+++ b/fs/ceph/mdsmap.h
@@ -0,0 +1,54 @@
1#ifndef _FS_CEPH_MDSMAP_H
2#define _FS_CEPH_MDSMAP_H
3
4#include "types.h"
5
6/*
7 * mds map - describe servers in the mds cluster.
8 *
9 * we limit fields to those the client actually cares about
10 */
11struct ceph_mds_info {
12 u64 global_id;
13 struct ceph_entity_addr addr;
14 s32 state;
15 int num_export_targets;
16 u32 *export_targets;
17};
18
19struct ceph_mdsmap {
20 u32 m_epoch, m_client_epoch, m_last_failure;
21 u32 m_root;
22 u32 m_session_timeout; /* seconds */
23 u32 m_session_autoclose; /* seconds */
24 u64 m_max_file_size;
25 u32 m_max_mds; /* size of m_info array */
26 struct ceph_mds_info *m_info;
27
28 /* which object pools file data can be stored in */
29 int m_num_data_pg_pools;
30 u32 *m_data_pg_pools;
31 u32 m_cas_pg_pool;
32};
33
34static inline struct ceph_entity_addr *
35ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
36{
37 if (w >= m->m_max_mds)
38 return NULL;
39 return &m->m_info[w].addr;
40}
41
42static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
43{
44 BUG_ON(w < 0);
45 if (w >= m->m_max_mds)
46 return CEPH_MDS_STATE_DNE;
47 return m->m_info[w].state;
48}
49
50extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
51extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
52extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
53
54#endif
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
new file mode 100644
index 000000000000..cd4fadb6491a
--- /dev/null
+++ b/fs/ceph/messenger.c
@@ -0,0 +1,2284 @@
1#include "ceph_debug.h"
2
3#include <linux/crc32c.h>
4#include <linux/ctype.h>
5#include <linux/highmem.h>
6#include <linux/inet.h>
7#include <linux/kthread.h>
8#include <linux/net.h>
9#include <linux/slab.h>
10#include <linux/socket.h>
11#include <linux/string.h>
12#include <net/tcp.h>
13
14#include "super.h"
15#include "messenger.h"
16#include "decode.h"
17#include "pagelist.h"
18
19/*
20 * Ceph uses the messenger to exchange ceph_msg messages with other
21 * hosts in the system. The messenger provides ordered and reliable
22 * delivery. We tolerate TCP disconnects by reconnecting (with
23 * exponential backoff) in the case of a fault (disconnection, bad
24 * crc, protocol error). Acks allow sent messages to be discarded by
25 * the sender.
26 */
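/*
 * Sketch of the backoff mentioned above (the constants here are
 * assumptions, not normative; ceph_fault(), declared below, implements
 * the real policy): each successive fault roughly doubles the
 * reconnect delay, up to a cap.
 */
static inline void example_backoff(unsigned long *delay)
{
 if (*delay == 0)
 *delay = HZ / 2; /* first retry after ~0.5s */
 else if (*delay < 5 * 60 * HZ) /* cap at ~5 minutes */
 *delay *= 2;
}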
27
28/* static tag bytes (protocol control messages) */
29static char tag_msg = CEPH_MSGR_TAG_MSG;
30static char tag_ack = CEPH_MSGR_TAG_ACK;
31static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
32
33#ifdef CONFIG_LOCKDEP
34static struct lock_class_key socket_class;
35#endif
36
37
38static void queue_con(struct ceph_connection *con);
39static void con_work(struct work_struct *);
40static void ceph_fault(struct ceph_connection *con);
41
42const char *ceph_name_type_str(int t)
43{
44 switch (t) {
45 case CEPH_ENTITY_TYPE_MON: return "mon";
46 case CEPH_ENTITY_TYPE_MDS: return "mds";
47 case CEPH_ENTITY_TYPE_OSD: return "osd";
48 case CEPH_ENTITY_TYPE_CLIENT: return "client";
49 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
50 default: return "???";
51 }
52}
53
54/*
55 * nicely render a sockaddr as a string.
56 */
57#define MAX_ADDR_STR 20
58static char addr_str[MAX_ADDR_STR][64]; /* room for full IPv6 + port */
59static DEFINE_SPINLOCK(addr_str_lock);
60static int last_addr_str;
61
62const char *pr_addr(const struct sockaddr_storage *ss)
63{
64 int i;
65 char *s;
66 struct sockaddr_in *in4 = (void *)ss;
67 unsigned char *quad = (void *)&in4->sin_addr.s_addr;
68 struct sockaddr_in6 *in6 = (void *)ss;
69
70 spin_lock(&addr_str_lock);
71 i = last_addr_str++;
72 if (last_addr_str == MAX_ADDR_STR)
73 last_addr_str = 0;
74 spin_unlock(&addr_str_lock);
75 s = addr_str[i];
76
77 switch (ss->ss_family) {
78 case AF_INET:
79 sprintf(s, "%u.%u.%u.%u:%u",
80 (unsigned int)quad[0],
81 (unsigned int)quad[1],
82 (unsigned int)quad[2],
83 (unsigned int)quad[3],
84 (unsigned int)ntohs(in4->sin_port));
85 break;
86
87 case AF_INET6:
88 /* %pI6 handles the byte order for us */
89 sprintf(s, "%pI6:%u", &in6->sin6_addr,
90 (unsigned int)ntohs(in6->sin6_port));
98 break;
99
100 default:
101 sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
102 }
103
104 return s;
105}
106
107static void encode_my_addr(struct ceph_messenger *msgr)
108{
109 memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
110 ceph_encode_addr(&msgr->my_enc_addr);
111}
112
113/*
114 * work queue for all reading and writing to/from the socket.
115 */
116struct workqueue_struct *ceph_msgr_wq;
117
118int __init ceph_msgr_init(void)
119{
120 ceph_msgr_wq = create_workqueue("ceph-msgr");
121 if (!ceph_msgr_wq) {
122 /* create_workqueue() returns NULL on failure, not an ERR_PTR */
123 pr_err("msgr_init failed to create workqueue\n");
124 return -ENOMEM;
125 }
127 return 0;
128}
129
130void ceph_msgr_exit(void)
131{
132 destroy_workqueue(ceph_msgr_wq);
133}
134
135/*
136 * socket callback functions
137 */
138
139/* data available on socket, or listen socket received a connect */
140static void ceph_data_ready(struct sock *sk, int count_unused)
141{
142 struct ceph_connection *con =
143 (struct ceph_connection *)sk->sk_user_data;
144 if (sk->sk_state != TCP_CLOSE_WAIT) {
145 dout("ceph_data_ready on %p state = %lu, queueing work\n",
146 con, con->state);
147 queue_con(con);
148 }
149}
150
151/* socket has buffer space for writing */
152static void ceph_write_space(struct sock *sk)
153{
154 struct ceph_connection *con =
155 (struct ceph_connection *)sk->sk_user_data;
156
157 /* only queue to workqueue if there is data we want to write. */
158 if (test_bit(WRITE_PENDING, &con->state)) {
159 dout("ceph_write_space %p queueing write work\n", con);
160 queue_con(con);
161 } else {
162 dout("ceph_write_space %p nothing to write\n", con);
163 }
164
165 /* since we have our own write_space, clear the SOCK_NOSPACE flag */
166 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
167}
168
169/* socket's state has changed */
170static void ceph_state_change(struct sock *sk)
171{
172 struct ceph_connection *con =
173 (struct ceph_connection *)sk->sk_user_data;
174
175 dout("ceph_state_change %p state = %lu sk_state = %u\n",
176 con, con->state, sk->sk_state);
177
178 if (test_bit(CLOSED, &con->state))
179 return;
180
181 switch (sk->sk_state) {
182 case TCP_CLOSE:
183 dout("ceph_state_change TCP_CLOSE\n");
184 case TCP_CLOSE_WAIT:
185 dout("ceph_state_change TCP_CLOSE_WAIT\n");
186 if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
187 if (test_bit(CONNECTING, &con->state))
188 con->error_msg = "connection failed";
189 else
190 con->error_msg = "socket closed";
191 queue_con(con);
192 }
193 break;
194 case TCP_ESTABLISHED:
195 dout("ceph_state_change TCP_ESTABLISHED\n");
196 queue_con(con);
197 break;
198 }
199}
200
201/*
202 * set up socket callbacks
203 */
204static void set_sock_callbacks(struct socket *sock,
205 struct ceph_connection *con)
206{
207 struct sock *sk = sock->sk;
208 sk->sk_user_data = (void *)con;
209 sk->sk_data_ready = ceph_data_ready;
210 sk->sk_write_space = ceph_write_space;
211 sk->sk_state_change = ceph_state_change;
212}
213
214
215/*
216 * socket helpers
217 */
218
219/*
220 * initiate connection to a remote socket.
221 */
222static struct socket *ceph_tcp_connect(struct ceph_connection *con)
223{
224 struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr;
225 struct socket *sock;
226 int ret;
227
228 BUG_ON(con->sock);
229 ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
230 if (ret)
231 return ERR_PTR(ret);
232 con->sock = sock;
233 sock->sk->sk_allocation = GFP_NOFS;
234
235#ifdef CONFIG_LOCKDEP
236 lockdep_set_class(&sock->sk->sk_lock, &socket_class);
237#endif
238
239 set_sock_callbacks(sock, con);
240
241 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
242
243 ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK);
244 if (ret == -EINPROGRESS) {
245 dout("connect %s EINPROGRESS sk_state = %u\n",
246 pr_addr(&con->peer_addr.in_addr),
247 sock->sk->sk_state);
248 ret = 0;
249 }
250 if (ret < 0) {
251 pr_err("connect %s error %d\n",
252 pr_addr(&con->peer_addr.in_addr), ret);
253 sock_release(sock);
254 con->sock = NULL;
255 con->error_msg = "connect error";
256 }
257
258 if (ret < 0)
259 return ERR_PTR(ret);
260 return sock;
261}
262
263static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
264{
265 struct kvec iov = {buf, len};
266 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
267
268 return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
269}
270
271/*
272 * write something. @more is true if caller will be sending more data
273 * shortly.
274 */
275static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
276 size_t kvlen, size_t len, int more)
277{
278 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
279
280 if (more)
281 msg.msg_flags |= MSG_MORE;
282 else
283 msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
284
285 return kernel_sendmsg(sock, &msg, iov, kvlen, len);
286}
287
288
289/*
290 * Shutdown/close the socket for the given connection.
291 */
292static int con_close_socket(struct ceph_connection *con)
293{
294 int rc;
295
296 dout("con_close_socket on %p sock %p\n", con, con->sock);
297 if (!con->sock)
298 return 0;
299 set_bit(SOCK_CLOSED, &con->state);
300 rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
301 sock_release(con->sock);
302 con->sock = NULL;
303 clear_bit(SOCK_CLOSED, &con->state);
304 return rc;
305}
306
307/*
308 * Reset a connection. Discard all incoming and outgoing messages
309 * and clear *_seq state.
310 */
311static void ceph_msg_remove(struct ceph_msg *msg)
312{
313 list_del_init(&msg->list_head);
314 ceph_msg_put(msg);
315}
316static void ceph_msg_remove_list(struct list_head *head)
317{
318 while (!list_empty(head)) {
319 struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
320 list_head);
321 ceph_msg_remove(msg);
322 }
323}
324
325static void reset_connection(struct ceph_connection *con)
326{
327 /* discard the out_queue and out_sent lists; reset *_seq state */
329 ceph_msg_remove_list(&con->out_queue);
330 ceph_msg_remove_list(&con->out_sent);
331
332 if (con->in_msg) {
333 ceph_msg_put(con->in_msg);
334 con->in_msg = NULL;
335 }
336
337 con->connect_seq = 0;
338 con->out_seq = 0;
339 if (con->out_msg) {
340 ceph_msg_put(con->out_msg);
341 con->out_msg = NULL;
342 }
343 con->in_seq = 0;
344 con->in_seq_acked = 0;
345}
346
347/*
348 * mark a peer down. drop any open connections.
349 */
350void ceph_con_close(struct ceph_connection *con)
351{
352 dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
353 set_bit(CLOSED, &con->state); /* in case there's queued work */
354 clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */
355 clear_bit(LOSSYTX, &con->state); /* so we retry next connect */
356 clear_bit(KEEPALIVE_PENDING, &con->state);
357 clear_bit(WRITE_PENDING, &con->state);
358 mutex_lock(&con->mutex);
359 reset_connection(con);
360 cancel_delayed_work(&con->work);
361 mutex_unlock(&con->mutex);
362 queue_con(con);
363}
364
365/*
366 * Reopen a closed connection, with a new peer address.
367 */
368void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
369{
370 dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
371 set_bit(OPENING, &con->state);
372 clear_bit(CLOSED, &con->state);
373 memcpy(&con->peer_addr, addr, sizeof(*addr));
374 con->delay = 0; /* reset backoff memory */
375 queue_con(con);
376}
377
378/*
379 * return true if this connection ever successfully opened
380 */
381bool ceph_con_opened(struct ceph_connection *con)
382{
383 return con->connect_seq > 0;
384}
385
386/*
387 * generic get/put
388 */
389struct ceph_connection *ceph_con_get(struct ceph_connection *con)
390{
391 dout("con_get %p nref = %d -> %d\n", con,
392 atomic_read(&con->nref), atomic_read(&con->nref) + 1);
393 if (atomic_inc_not_zero(&con->nref))
394 return con;
395 return NULL;
396}
397
398void ceph_con_put(struct ceph_connection *con)
399{
400 dout("con_put %p nref = %d -> %d\n", con,
401 atomic_read(&con->nref), atomic_read(&con->nref) - 1);
402 BUG_ON(atomic_read(&con->nref) == 0);
403 if (atomic_dec_and_test(&con->nref)) {
404 BUG_ON(con->sock);
405 kfree(con);
406 }
407}
408
409/*
410 * initialize a new connection.
411 */
412void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
413{
414 dout("con_init %p\n", con);
415 memset(con, 0, sizeof(*con));
416 atomic_set(&con->nref, 1);
417 con->msgr = msgr;
418 mutex_init(&con->mutex);
419 INIT_LIST_HEAD(&con->out_queue);
420 INIT_LIST_HEAD(&con->out_sent);
421 INIT_DELAYED_WORK(&con->work, con_work);
422}
423
424
425/*
426 * We maintain a global counter to order connection attempts. Get
427 * a unique seq greater than @gt.
428 */
429static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
430{
431 u32 ret;
432
433 spin_lock(&msgr->global_seq_lock);
434 if (msgr->global_seq < gt)
435 msgr->global_seq = gt;
436 ret = ++msgr->global_seq;
437 spin_unlock(&msgr->global_seq_lock);
438 return ret;
439}
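/*
 * Editorial sketch (not in the original patch): the contract of
 * get_global_seq() is that each return value is unique, increasing,
 * and strictly greater than @gt.  For example:
 */
#if 0
	u32 a = get_global_seq(msgr, 0);	/* e.g. 1 */
	u32 b = get_global_seq(msgr, 10);	/* at least 11 */
	u32 c = get_global_seq(msgr, 0);	/* greater than b */
#endif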
440
441
442/*
443 * Prepare footer for currently outgoing message, and finish things
444 * off. Assumes out_kvec* are already valid.. we just add on to the end.
445 */
446static void prepare_write_message_footer(struct ceph_connection *con, int v)
447{
448 struct ceph_msg *m = con->out_msg;
449
450 dout("prepare_write_message_footer %p\n", con);
451 con->out_kvec_is_msg = true;
452 con->out_kvec[v].iov_base = &m->footer;
453 con->out_kvec[v].iov_len = sizeof(m->footer);
454 con->out_kvec_bytes += sizeof(m->footer);
455 con->out_kvec_left++;
456 con->out_more = m->more_to_follow;
457 con->out_msg_done = true;
458}
459
460/*
461 * Prepare headers for the next outgoing message.
462 */
463static void prepare_write_message(struct ceph_connection *con)
464{
465 struct ceph_msg *m;
466 int v = 0;
467
468 con->out_kvec_bytes = 0;
469 con->out_kvec_is_msg = true;
470 con->out_msg_done = false;
471
472 /* Sneak an ack in there first? If we can get it into the same
473 * TCP packet that's a good thing. */
474 if (con->in_seq > con->in_seq_acked) {
475 con->in_seq_acked = con->in_seq;
476 con->out_kvec[v].iov_base = &tag_ack;
477 con->out_kvec[v++].iov_len = 1;
478 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
479 con->out_kvec[v].iov_base = &con->out_temp_ack;
480 con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
481 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
482 }
483
484 m = list_first_entry(&con->out_queue,
485 struct ceph_msg, list_head);
486 con->out_msg = m;
487 if (test_bit(LOSSYTX, &con->state)) {
488 list_del_init(&m->list_head);
489 } else {
490 /* put message on sent list */
491 ceph_msg_get(m);
492 list_move_tail(&m->list_head, &con->out_sent);
493 }
494
495 /*
496 * only assign outgoing seq # if we haven't sent this message
497 * yet. if it is requeued, resend with its original seq.
498 */
499 if (m->needs_out_seq) {
500 m->hdr.seq = cpu_to_le64(++con->out_seq);
501 m->needs_out_seq = false;
502 }
503
504 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
505 m, con->out_seq, le16_to_cpu(m->hdr.type),
506 le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
507 le32_to_cpu(m->hdr.data_len),
508 m->nr_pages);
509 BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
510
511 /* tag + hdr + front + middle */
512 con->out_kvec[v].iov_base = &tag_msg;
513 con->out_kvec[v++].iov_len = 1;
514 con->out_kvec[v].iov_base = &m->hdr;
515 con->out_kvec[v++].iov_len = sizeof(m->hdr);
516 con->out_kvec[v++] = m->front;
517 if (m->middle)
518 con->out_kvec[v++] = m->middle->vec;
519 con->out_kvec_left = v;
520 con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
521 (m->middle ? m->middle->vec.iov_len : 0);
522 con->out_kvec_cur = con->out_kvec;
523
524 /* fill in crc (except data pages), footer */
525 con->out_msg->hdr.crc =
526 cpu_to_le32(crc32c(0, (void *)&m->hdr,
527 sizeof(m->hdr) - sizeof(m->hdr.crc)));
528 con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
529 con->out_msg->footer.front_crc =
530 cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
531 if (m->middle)
532 con->out_msg->footer.middle_crc =
533 cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
534 m->middle->vec.iov_len));
535 else
536 con->out_msg->footer.middle_crc = 0;
537 con->out_msg->footer.data_crc = 0;
538 dout("prepare_write_message front_crc %u data_crc %u\n",
539 le32_to_cpu(con->out_msg->footer.front_crc),
540 le32_to_cpu(con->out_msg->footer.middle_crc));
541
542 /* is there a data payload? */
543 if (le32_to_cpu(m->hdr.data_len) > 0) {
544 /* initialize page iterator */
545 con->out_msg_pos.page = 0;
546 con->out_msg_pos.page_pos =
547 le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
548 con->out_msg_pos.data_pos = 0;
549 con->out_msg_pos.did_page_crc = 0;
550 con->out_more = 1; /* data + footer will follow */
551 } else {
552 /* no, queue up footer too and be done */
553 prepare_write_message_footer(con, v);
554 }
555
556 set_bit(WRITE_PENDING, &con->state);
557}
558
559/*
560 * Prepare an ack.
561 */
562static void prepare_write_ack(struct ceph_connection *con)
563{
564 dout("prepare_write_ack %p %llu -> %llu\n", con,
565 con->in_seq_acked, con->in_seq);
566 con->in_seq_acked = con->in_seq;
567
568 con->out_kvec[0].iov_base = &tag_ack;
569 con->out_kvec[0].iov_len = 1;
570 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
571 con->out_kvec[1].iov_base = &con->out_temp_ack;
572 con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
573 con->out_kvec_left = 2;
574 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
575 con->out_kvec_cur = con->out_kvec;
576 con->out_more = 1; /* more will follow.. eventually.. */
577 set_bit(WRITE_PENDING, &con->state);
578}
579
580/*
581 * Prepare to write keepalive byte.
582 */
583static void prepare_write_keepalive(struct ceph_connection *con)
584{
585 dout("prepare_write_keepalive %p\n", con);
586 con->out_kvec[0].iov_base = &tag_keepalive;
587 con->out_kvec[0].iov_len = 1;
588 con->out_kvec_left = 1;
589 con->out_kvec_bytes = 1;
590 con->out_kvec_cur = con->out_kvec;
591 set_bit(WRITE_PENDING, &con->state);
592}
593
594/*
595 * Connection negotiation.
596 */
597
598static void prepare_connect_authorizer(struct ceph_connection *con)
599{
600 void *auth_buf = NULL; /* stays NULL if there is no get_authorizer op */
601 int auth_len = 0;
602 int auth_protocol = 0;
603
604 mutex_unlock(&con->mutex);
605 if (con->ops->get_authorizer)
606 con->ops->get_authorizer(con, &auth_buf, &auth_len,
607 &auth_protocol, &con->auth_reply_buf,
608 &con->auth_reply_buf_len,
609 con->auth_retry);
610 mutex_lock(&con->mutex);
611
612 con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
613 con->out_connect.authorizer_len = cpu_to_le32(auth_len);
614
615 con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
616 con->out_kvec[con->out_kvec_left].iov_len = auth_len;
617 con->out_kvec_left++;
618 con->out_kvec_bytes += auth_len;
619}
620
621/*
622 * We connected to a peer and are saying hello.
623 */
624static void prepare_write_banner(struct ceph_messenger *msgr,
625 struct ceph_connection *con)
626{
627 int len = strlen(CEPH_BANNER);
628
629 con->out_kvec[0].iov_base = CEPH_BANNER;
630 con->out_kvec[0].iov_len = len;
631 con->out_kvec[1].iov_base = &msgr->my_enc_addr;
632 con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
633 con->out_kvec_left = 2;
634 con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
635 con->out_kvec_cur = con->out_kvec;
636 con->out_more = 0;
637 set_bit(WRITE_PENDING, &con->state);
638}
639
640static void prepare_write_connect(struct ceph_messenger *msgr,
641 struct ceph_connection *con,
642 int after_banner)
643{
644 unsigned global_seq = get_global_seq(con->msgr, 0);
645 int proto;
646
647 switch (con->peer_name.type) {
648 case CEPH_ENTITY_TYPE_MON:
649 proto = CEPH_MONC_PROTOCOL;
650 break;
651 case CEPH_ENTITY_TYPE_OSD:
652 proto = CEPH_OSDC_PROTOCOL;
653 break;
654 case CEPH_ENTITY_TYPE_MDS:
655 proto = CEPH_MDSC_PROTOCOL;
656 break;
657 default:
658 BUG();
659 }
660
661 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
662 con->connect_seq, global_seq, proto);
663
664 con->out_connect.features = CEPH_FEATURE_SUPPORTED;
665 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
666 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
667 con->out_connect.global_seq = cpu_to_le32(global_seq);
668 con->out_connect.protocol_version = cpu_to_le32(proto);
669 con->out_connect.flags = 0;
670
671 if (!after_banner) {
672 con->out_kvec_left = 0;
673 con->out_kvec_bytes = 0;
674 }
675 con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
676 con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
677 con->out_kvec_left++;
678 con->out_kvec_bytes += sizeof(con->out_connect);
679 con->out_kvec_cur = con->out_kvec;
680 con->out_more = 0;
681 set_bit(WRITE_PENDING, &con->state);
682
683 prepare_connect_authorizer(con);
684}
685
686
687/*
688 * write as much of pending kvecs to the socket as we can.
689 * 1 -> done
690 * 0 -> socket full, but more to do
691 * <0 -> error
692 */
693static int write_partial_kvec(struct ceph_connection *con)
694{
695 int ret;
696
697 dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
698 while (con->out_kvec_bytes > 0) {
699 ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
700 con->out_kvec_left, con->out_kvec_bytes,
701 con->out_more);
702 if (ret <= 0)
703 goto out;
704 con->out_kvec_bytes -= ret;
705 if (con->out_kvec_bytes == 0)
706 break; /* done */
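		/* account for a partial send: step out_kvec_cur past any
		 * fully-sent kvecs, then trim the one we stopped inside */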
707 while (ret > 0) {
708 if (ret >= con->out_kvec_cur->iov_len) {
709 ret -= con->out_kvec_cur->iov_len;
710 con->out_kvec_cur++;
711 con->out_kvec_left--;
712 } else {
713 con->out_kvec_cur->iov_len -= ret;
714 con->out_kvec_cur->iov_base += ret;
715 ret = 0;
716 break;
717 }
718 }
719 }
720 con->out_kvec_left = 0;
721 con->out_kvec_is_msg = false;
722 ret = 1;
723out:
724 dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
725 con->out_kvec_bytes, con->out_kvec_left, ret);
726 return ret; /* 1 = done, 0 = socket full, <0 = error */
727}
728
729/*
730 * Write as much message data payload as we can. If we finish, queue
731 * up the footer.
732 * 1 -> done, footer is now queued in out_kvec[].
733 * 0 -> socket full, but more to do
734 * <0 -> error
735 */
736static int write_partial_msg_pages(struct ceph_connection *con)
737{
738 struct ceph_msg *msg = con->out_msg;
739 unsigned data_len = le32_to_cpu(msg->hdr.data_len);
740 size_t len;
741 int crc = con->msgr->nocrc;
742 int ret;
743
744 dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
745 con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
746 con->out_msg_pos.page_pos);
747
748 while (con->out_msg_pos.page < con->out_msg->nr_pages) {
749 struct page *page = NULL;
750 void *kaddr = NULL;
751
752 /*
753 * if we are calculating the data crc (the default), we need
754 * to map the page. if our pages[] has been revoked, use the
755 * zero page.
756 */
757 if (msg->pages) {
758 page = msg->pages[con->out_msg_pos.page];
759 if (crc)
760 kaddr = kmap(page);
761 } else if (msg->pagelist) {
762 page = list_first_entry(&msg->pagelist->head,
763 struct page, lru);
764 if (crc)
765 kaddr = kmap(page);
766 } else {
767 page = con->msgr->zero_page;
768 if (crc)
769 kaddr = page_address(con->msgr->zero_page);
770 }
771 len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
772 (int)(data_len - con->out_msg_pos.data_pos));
773 if (crc && !con->out_msg_pos.did_page_crc) {
774 void *base = kaddr + con->out_msg_pos.page_pos;
775 u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
776
777 BUG_ON(kaddr == NULL);
778 con->out_msg->footer.data_crc =
779 cpu_to_le32(crc32c(tmpcrc, base, len));
780 con->out_msg_pos.did_page_crc = 1;
781 }
782
783 ret = kernel_sendpage(con->sock, page,
784 con->out_msg_pos.page_pos, len,
785 MSG_DONTWAIT | MSG_NOSIGNAL |
786 MSG_MORE);
787
788 if (crc && (msg->pages || msg->pagelist))
789 kunmap(page);
790
791 if (ret <= 0)
792 goto out;
793
794 con->out_msg_pos.data_pos += ret;
795 con->out_msg_pos.page_pos += ret;
796 if (ret == len) {
797 con->out_msg_pos.page_pos = 0;
798 con->out_msg_pos.page++;
799 con->out_msg_pos.did_page_crc = 0;
800 if (msg->pagelist)
801 list_move_tail(&page->lru,
802 &msg->pagelist->head);
803 }
804 }
805
806 dout("write_partial_msg_pages %p msg %p done\n", con, msg);
807
808 /* prepare and queue up footer, too */
809 if (!crc)
810 con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
811 con->out_kvec_bytes = 0;
812 con->out_kvec_left = 0;
813 con->out_kvec_cur = con->out_kvec;
814 prepare_write_message_footer(con, 0);
815 ret = 1;
816out:
817 return ret;
818}
819
820/*
821 * write some zeros
822 */
823static int write_partial_skip(struct ceph_connection *con)
824{
825 int ret;
826
827 while (con->out_skip > 0) {
828 struct kvec iov = {
829 .iov_base = page_address(con->msgr->zero_page),
830 .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
831 };
832
833 ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
834 if (ret <= 0)
835 goto out;
836 con->out_skip -= ret;
837 }
838 ret = 1;
839out:
840 return ret;
841}
842
843/*
844 * Prepare to read connection handshake, or an ack.
845 */
846static void prepare_read_banner(struct ceph_connection *con)
847{
848 dout("prepare_read_banner %p\n", con);
849 con->in_base_pos = 0;
850}
851
852static void prepare_read_connect(struct ceph_connection *con)
853{
854 dout("prepare_read_connect %p\n", con);
855 con->in_base_pos = 0;
856}
857
858static void prepare_read_ack(struct ceph_connection *con)
859{
860 dout("prepare_read_ack %p\n", con);
861 con->in_base_pos = 0;
862}
863
864static void prepare_read_tag(struct ceph_connection *con)
865{
866 dout("prepare_read_tag %p\n", con);
867 con->in_base_pos = 0;
868 con->in_tag = CEPH_MSGR_TAG_READY;
869}
870
871/*
872 * Prepare to read a message.
873 */
874static int prepare_read_message(struct ceph_connection *con)
875{
876 dout("prepare_read_message %p\n", con);
877 BUG_ON(con->in_msg != NULL);
878 con->in_base_pos = 0;
879 con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
880 return 0;
881}
882
883
884static int read_partial(struct ceph_connection *con,
885 int *to, int size, void *object)
886{
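	/*
	 * Editorial note: @to is a running cursor shared across successive
	 * read_partial() calls; each call advances it by @size, while
	 * con->in_base_pos tracks how many bytes of that stream of fields
	 * have actually arrived so far.
	 */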
887 *to += size;
888 while (con->in_base_pos < *to) {
889 int left = *to - con->in_base_pos;
890 int have = size - left;
891 int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
892 if (ret <= 0)
893 return ret;
894 con->in_base_pos += ret;
895 }
896 return 1;
897}
898
899
900/*
901 * Read all or part of the connect-side handshake on a new connection
902 */
903static int read_partial_banner(struct ceph_connection *con)
904{
905 int ret, to = 0;
906
907 dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
908
909 /* peer's banner */
910 ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
911 if (ret <= 0)
912 goto out;
913 ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
914 &con->actual_peer_addr);
915 if (ret <= 0)
916 goto out;
917 ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
918 &con->peer_addr_for_me);
919 if (ret <= 0)
920 goto out;
921out:
922 return ret;
923}
924
925static int read_partial_connect(struct ceph_connection *con)
926{
927 int ret, to = 0;
928
929 dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
930
931 ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
932 if (ret <= 0)
933 goto out;
934 ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
935 con->auth_reply_buf);
936 if (ret <= 0)
937 goto out;
938
939 dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
940 con, (int)con->in_reply.tag,
941 le32_to_cpu(con->in_reply.connect_seq),
942 le32_to_cpu(con->in_reply.global_seq));
943out:
944 return ret;
945
946}
947
948/*
949 * Verify the hello banner looks okay.
950 */
951static int verify_hello(struct ceph_connection *con)
952{
953 if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
954 pr_err("connect to %s got bad banner\n",
955 pr_addr(&con->peer_addr.in_addr));
956 con->error_msg = "protocol error, bad banner";
957 return -1;
958 }
959 return 0;
960}
961
962static bool addr_is_blank(struct sockaddr_storage *ss)
963{
964 switch (ss->ss_family) {
965 case AF_INET:
966 return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
967 case AF_INET6:
968 return
969 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
970 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
971 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
972 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
973 }
974 return false;
975}
976
977static int addr_port(struct sockaddr_storage *ss)
978{
979 switch (ss->ss_family) {
980 case AF_INET:
981 return ntohs(((struct sockaddr_in *)ss)->sin_port);
982 case AF_INET6:
983 return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
984 }
985 return 0;
986}
987
988static void addr_set_port(struct sockaddr_storage *ss, int p)
989{
990 switch (ss->ss_family) {
991 case AF_INET:
992 ((struct sockaddr_in *)ss)->sin_port = htons(p);
993 break; /* don't fall through into the IPv6 store */
994 case AF_INET6:
995 ((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
996 }
997}
997
998/*
999 * Parse an ip[:port] list into an addr array. Use the default
1000 * monitor port if a port isn't specified.
1001 */
1002int ceph_parse_ips(const char *c, const char *end,
1003 struct ceph_entity_addr *addr,
1004 int max_count, int *count)
1005{
1006 int i;
1007 const char *p = c;
1008
1009 dout("parse_ips on '%.*s'\n", (int)(end-c), c);
1010 for (i = 0; i < max_count; i++) {
1011 const char *ipend;
1012 struct sockaddr_storage *ss = &addr[i].in_addr;
1013 struct sockaddr_in *in4 = (void *)ss;
1014 struct sockaddr_in6 *in6 = (void *)ss;
1015 int port;
1016
1017 memset(ss, 0, sizeof(*ss));
1018 if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
1019 ',', &ipend)) {
1020 ss->ss_family = AF_INET;
1021 } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
1022 ',', &ipend)) {
1023 ss->ss_family = AF_INET6;
1024 } else {
1025 goto bad;
1026 }
1027 p = ipend;
1028
1029 /* port? */
1030 if (p < end && *p == ':') {
1031 port = 0;
1032 p++;
1033 while (p < end && *p >= '0' && *p <= '9') {
1034 port = (port * 10) + (*p - '0');
1035 p++;
1036 }
1037 if (port > 65535 || port == 0)
1038 goto bad;
1039 } else {
1040 port = CEPH_MON_PORT;
1041 }
1042
1043 addr_set_port(ss, port);
1044
1045 dout("parse_ips got %s\n", pr_addr(ss));
1046
1047 if (p == end)
1048 break;
1049 if (*p != ',')
1050 goto bad;
1051 p++;
1052 }
1053
1054 if (p != end)
1055 goto bad;
1056
1057 if (count)
1058 *count = i + 1;
1059 return 0;
1060
1061bad:
1062 pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
1063 return -EINVAL;
1064}
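/*
 * Editorial sketch (not part of this patch): parsing a mount-style
 * monitor list with ceph_parse_ips().  The address string is made up;
 * entries without an explicit port default to CEPH_MON_PORT.
 */
#if 0
static int example_parse_mons(struct ceph_entity_addr *mon_addr /* [3] */)
{
	const char *s = "1.2.3.4:6789,10.0.0.1";
	int num_mon;
	int err;

	err = ceph_parse_ips(s, s + strlen(s), mon_addr, 3, &num_mon);
	if (err < 0)
		return err;		/* -EINVAL on malformed input */
	/* here num_mon == 2 and mon_addr[1] carries CEPH_MON_PORT */
	return 0;
}
#endif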
1065
1066static int process_banner(struct ceph_connection *con)
1067{
1068 dout("process_banner on %p\n", con);
1069
1070 if (verify_hello(con) < 0)
1071 return -1;
1072
1073 ceph_decode_addr(&con->actual_peer_addr);
1074 ceph_decode_addr(&con->peer_addr_for_me);
1075
1076 /*
1077 * Make sure the other end is who we wanted. note that the other
1078 * end may not yet know their ip address, so if it's 0.0.0.0, give
1079 * them the benefit of the doubt.
1080 */
1081 if (memcmp(&con->peer_addr, &con->actual_peer_addr,
1082 sizeof(con->peer_addr)) != 0 &&
1083 !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
1084 con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
1085 pr_warning("wrong peer, want %s/%lld, got %s/%lld\n",
1086 pr_addr(&con->peer_addr.in_addr),
1087 le64_to_cpu(con->peer_addr.nonce),
1088 pr_addr(&con->actual_peer_addr.in_addr),
1089 le64_to_cpu(con->actual_peer_addr.nonce));
1090 con->error_msg = "wrong peer at address";
1091 return -1;
1092 }
1093
1094 /*
1095 * did we learn our address?
1096 */
1097 if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
1098 int port = addr_port(&con->msgr->inst.addr.in_addr);
1099
1100 memcpy(&con->msgr->inst.addr.in_addr,
1101 &con->peer_addr_for_me.in_addr,
1102 sizeof(con->peer_addr_for_me.in_addr));
1103 addr_set_port(&con->msgr->inst.addr.in_addr, port);
1104 encode_my_addr(con->msgr);
1105 dout("process_banner learned my addr is %s\n",
1106 pr_addr(&con->msgr->inst.addr.in_addr));
1107 }
1108
1109 set_bit(NEGOTIATING, &con->state);
1110 prepare_read_connect(con);
1111 return 0;
1112}
1113
1114static void fail_protocol(struct ceph_connection *con)
1115{
1116 reset_connection(con);
1117 set_bit(CLOSED, &con->state); /* in case there's queued work */
1118
1119 mutex_unlock(&con->mutex);
1120 if (con->ops->bad_proto)
1121 con->ops->bad_proto(con);
1122 mutex_lock(&con->mutex);
1123}
1124
1125static int process_connect(struct ceph_connection *con)
1126{
1127 u64 sup_feat = CEPH_FEATURE_SUPPORTED;
1128 u64 req_feat = CEPH_FEATURE_REQUIRED;
1129 u64 server_feat = le64_to_cpu(con->in_reply.features);
1130
1131 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
1132
1133 switch (con->in_reply.tag) {
1134 case CEPH_MSGR_TAG_FEATURES:
1135 pr_err("%s%lld %s feature set mismatch,"
1136 " my %llx < server's %llx, missing %llx\n",
1137 ENTITY_NAME(con->peer_name),
1138 pr_addr(&con->peer_addr.in_addr),
1139 sup_feat, server_feat, server_feat & ~sup_feat);
1140 con->error_msg = "missing required protocol features";
1141 fail_protocol(con);
1142 return -1;
1143
1144 case CEPH_MSGR_TAG_BADPROTOVER:
1145 pr_err("%s%lld %s protocol version mismatch,"
1146 " my %d != server's %d\n",
1147 ENTITY_NAME(con->peer_name),
1148 pr_addr(&con->peer_addr.in_addr),
1149 le32_to_cpu(con->out_connect.protocol_version),
1150 le32_to_cpu(con->in_reply.protocol_version));
1151 con->error_msg = "protocol version mismatch";
1152 fail_protocol(con);
1153 return -1;
1154
1155 case CEPH_MSGR_TAG_BADAUTHORIZER:
1156 con->auth_retry++;
1157 dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
1158 con->auth_retry);
1159 if (con->auth_retry == 2) {
1160 con->error_msg = "connect authorization failure";
1161 reset_connection(con);
1162 set_bit(CLOSED, &con->state);
1163 return -1;
1164 }
1165 con->auth_retry = 1;
1166 prepare_write_connect(con->msgr, con, 0);
1167 prepare_read_connect(con);
1168 break;
1169
1170 case CEPH_MSGR_TAG_RESETSESSION:
1171 /*
1172 * If we connected with a large connect_seq but the peer
1173 * has no record of a session with us (no connection, or
1174 * connect_seq == 0), they will send RESETSESSION to indicate
1175 * that they must have reset their session, and may have
1176 * dropped messages.
1177 */
1178 dout("process_connect got RESET peer seq %u\n",
1179 le32_to_cpu(con->in_connect.connect_seq));
1180 pr_err("%s%lld %s connection reset\n",
1181 ENTITY_NAME(con->peer_name),
1182 pr_addr(&con->peer_addr.in_addr));
1183 reset_connection(con);
1184 prepare_write_connect(con->msgr, con, 0);
1185 prepare_read_connect(con);
1186
1187 /* Tell ceph about it. */
1188 mutex_unlock(&con->mutex);
1189 pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
1190 if (con->ops->peer_reset)
1191 con->ops->peer_reset(con);
1192 mutex_lock(&con->mutex);
1193 break;
1194
1195 case CEPH_MSGR_TAG_RETRY_SESSION:
1196 /*
1197 * If we sent a smaller connect_seq than the peer has, try
1198 * again with a larger value.
1199 */
1200 dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
1201 le32_to_cpu(con->out_connect.connect_seq),
1202 le32_to_cpu(con->in_connect.connect_seq));
1203 con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
1204 prepare_write_connect(con->msgr, con, 0);
1205 prepare_read_connect(con);
1206 break;
1207
1208 case CEPH_MSGR_TAG_RETRY_GLOBAL:
1209 /*
1210 * If we sent a smaller global_seq than the peer has, try
1211 * again with a larger value.
1212 */
1213 dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
1214 con->peer_global_seq,
1215 le32_to_cpu(con->in_connect.global_seq));
1216 get_global_seq(con->msgr,
1217 le32_to_cpu(con->in_connect.global_seq));
1218 prepare_write_connect(con->msgr, con, 0);
1219 prepare_read_connect(con);
1220 break;
1221
1222 case CEPH_MSGR_TAG_READY:
1223 if (req_feat & ~server_feat) {
1224 pr_err("%s%lld %s protocol feature mismatch,"
1225 " my required %llx > server's %llx, need %llx\n",
1226 ENTITY_NAME(con->peer_name),
1227 pr_addr(&con->peer_addr.in_addr),
1228 req_feat, server_feat, req_feat & ~server_feat);
1229 con->error_msg = "missing required protocol features";
1230 fail_protocol(con);
1231 return -1;
1232 }
1233 clear_bit(CONNECTING, &con->state);
1234 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1235 con->connect_seq++;
1236 dout("process_connect got READY gseq %d cseq %d (%d)\n",
1237 con->peer_global_seq,
1238 le32_to_cpu(con->in_reply.connect_seq),
1239 con->connect_seq);
1240 WARN_ON(con->connect_seq !=
1241 le32_to_cpu(con->in_reply.connect_seq));
1242
1243 if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
1244 set_bit(LOSSYTX, &con->state);
1245
1246 prepare_read_tag(con);
1247 break;
1248
1249 case CEPH_MSGR_TAG_WAIT:
1250 /*
1251 * If there is a connection race (we are opening
1252 * connections to each other), one of us may just have
1253 * to WAIT. This shouldn't happen if we are the
1254 * client.
1255 */
1256 pr_err("process_connect peer connecting WAIT\n");
1257 /* fall through: the client treats WAIT as a protocol error */
1258 default:
1259 pr_err("connect protocol error, will retry\n");
1260 con->error_msg = "protocol error, garbage tag during connect";
1261 return -1;
1262 }
1263 return 0;
1264}
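/*
 * Editorial summary of the reply tags handled above: FEATURES and
 * BADPROTOVER are fatal mismatches; BADAUTHORIZER gets one retry with
 * fresh authorizer data; RESETSESSION wipes local state and
 * reconnects; RETRY_SESSION and RETRY_GLOBAL re-send the connect with
 * the peer's larger sequence numbers; READY completes the handshake.
 */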
1265
1266
1267/*
1268 * read (part of) an ack
1269 */
1270static int read_partial_ack(struct ceph_connection *con)
1271{
1272 int to = 0;
1273
1274 return read_partial(con, &to, sizeof(con->in_temp_ack),
1275 &con->in_temp_ack);
1276}
1277
1278
1279/*
1280 * We can finally discard anything that's been acked.
1281 */
1282static void process_ack(struct ceph_connection *con)
1283{
1284 struct ceph_msg *m;
1285 u64 ack = le64_to_cpu(con->in_temp_ack);
1286 u64 seq;
1287
1288 while (!list_empty(&con->out_sent)) {
1289 m = list_first_entry(&con->out_sent, struct ceph_msg,
1290 list_head);
1291 seq = le64_to_cpu(m->hdr.seq);
1292 if (seq > ack)
1293 break;
1294 dout("got ack for seq %llu type %d at %p\n", seq,
1295 le16_to_cpu(m->hdr.type), m);
1296 ceph_msg_remove(m);
1297 }
1298 prepare_read_tag(con);
1299}
1300
1301
1302
1303
1304static int read_partial_message_section(struct ceph_connection *con,
1305 struct kvec *section, unsigned int sec_len,
1306 u32 *crc)
1307{
1308 int left;
1309 int ret;
1310
1311 BUG_ON(!section);
1312
1313 while (section->iov_len < sec_len) {
1314 BUG_ON(section->iov_base == NULL);
1315 left = sec_len - section->iov_len;
1316 ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
1317 section->iov_len, left);
1318 if (ret <= 0)
1319 return ret;
1320 section->iov_len += ret;
1321 if (section->iov_len == sec_len)
1322 *crc = crc32c(0, section->iov_base,
1323 section->iov_len);
1324 }
1325
1326 return 1;
1327}
1328
1329static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
1330 struct ceph_msg_header *hdr,
1331 int *skip);
1332/*
1333 * read (part of) a message.
1334 */
1335static int read_partial_message(struct ceph_connection *con)
1336{
1337 struct ceph_msg *m = con->in_msg;
1338 void *p;
1339 int ret;
1340 int to, left;
1341 unsigned front_len, middle_len, data_len, data_off;
1342 int datacrc = con->msgr->nocrc;
1343 int skip;
1344 u64 seq;
1345
1346 dout("read_partial_message con %p msg %p\n", con, m);
1347
1348 /* header */
1349 while (con->in_base_pos < sizeof(con->in_hdr)) {
1350 left = sizeof(con->in_hdr) - con->in_base_pos;
1351 ret = ceph_tcp_recvmsg(con->sock,
1352 (char *)&con->in_hdr + con->in_base_pos,
1353 left);
1354 if (ret <= 0)
1355 return ret;
1356 con->in_base_pos += ret;
1357 if (con->in_base_pos == sizeof(con->in_hdr)) {
1358 u32 crc = crc32c(0, (void *)&con->in_hdr,
1359 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
1360 if (crc != le32_to_cpu(con->in_hdr.crc)) {
1361 pr_err("read_partial_message bad hdr"
1362 " crc %u != expected %u\n",
1363 crc, le32_to_cpu(con->in_hdr.crc));
1364 return -EBADMSG;
1365 }
1366 }
1367 }
1368 front_len = le32_to_cpu(con->in_hdr.front_len);
1369 if (front_len > CEPH_MSG_MAX_FRONT_LEN)
1370 return -EIO;
1371 middle_len = le32_to_cpu(con->in_hdr.middle_len);
1372 if (middle_len > CEPH_MSG_MAX_DATA_LEN)
1373 return -EIO;
1374 data_len = le32_to_cpu(con->in_hdr.data_len);
1375 if (data_len > CEPH_MSG_MAX_DATA_LEN)
1376 return -EIO;
1377 data_off = le16_to_cpu(con->in_hdr.data_off);
1378
1379 /* verify seq# */
1380 seq = le64_to_cpu(con->in_hdr.seq);
1381 if ((s64)seq - (s64)con->in_seq < 1) {
1382 pr_info("skipping %s%lld %s seq %lld, expected %lld\n",
1383 ENTITY_NAME(con->peer_name),
1384 pr_addr(&con->peer_addr.in_addr),
1385 seq, con->in_seq + 1);
1386 con->in_base_pos = -front_len - middle_len - data_len -
1387 sizeof(m->footer);
1388 con->in_tag = CEPH_MSGR_TAG_READY;
1389 con->in_seq++;
1390 return 0;
1391 } else if ((s64)seq - (s64)con->in_seq > 1) {
1392 pr_err("read_partial_message bad seq %lld expected %lld\n",
1393 seq, con->in_seq + 1);
1394 con->error_msg = "bad message sequence # for incoming message";
1395 return -EBADMSG;
1396 }
1397
1398 /* allocate message? */
1399 if (!con->in_msg) {
1400 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
1401 con->in_hdr.front_len, con->in_hdr.data_len);
1402 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1403 if (skip) {
1404 /* skip this message */
1405 dout("alloc_msg returned NULL, skipping message\n");
1406 con->in_base_pos = -front_len - middle_len - data_len -
1407 sizeof(m->footer);
1408 con->in_tag = CEPH_MSGR_TAG_READY;
1409 con->in_seq++;
1410 return 0;
1411 }
1412 if (IS_ERR(con->in_msg)) {
1413 ret = PTR_ERR(con->in_msg);
1414 con->in_msg = NULL;
1415 con->error_msg =
1416 "error allocating memory for incoming message";
1417 return ret;
1418 }
1419 m = con->in_msg;
1420 m->front.iov_len = 0; /* haven't read it yet */
1421 if (m->middle)
1422 m->middle->vec.iov_len = 0;
1423
1424 con->in_msg_pos.page = 0;
1425 con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
1426 con->in_msg_pos.data_pos = 0;
1427 }
1428
1429 /* front */
1430 ret = read_partial_message_section(con, &m->front, front_len,
1431 &con->in_front_crc);
1432 if (ret <= 0)
1433 return ret;
1434
1435 /* middle */
1436 if (m->middle) {
1437 ret = read_partial_message_section(con, &m->middle->vec, middle_len,
1438 &con->in_middle_crc);
1439 if (ret <= 0)
1440 return ret;
1441 }
1442
1443 /* (page) data */
1444 while (con->in_msg_pos.data_pos < data_len) {
1445 left = min((int)(data_len - con->in_msg_pos.data_pos),
1446 (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
1447 BUG_ON(m->pages == NULL);
1448 p = kmap(m->pages[con->in_msg_pos.page]);
1449 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1450 left);
1451 if (ret > 0 && datacrc)
1452 con->in_data_crc =
1453 crc32c(con->in_data_crc,
1454 p + con->in_msg_pos.page_pos, ret);
1455 kunmap(m->pages[con->in_msg_pos.page]);
1456 if (ret <= 0)
1457 return ret;
1458 con->in_msg_pos.data_pos += ret;
1459 con->in_msg_pos.page_pos += ret;
1460 if (con->in_msg_pos.page_pos == PAGE_SIZE) {
1461 con->in_msg_pos.page_pos = 0;
1462 con->in_msg_pos.page++;
1463 }
1464 }
1465
1466 /* footer */
1467 to = sizeof(m->hdr) + sizeof(m->footer);
1468 while (con->in_base_pos < to) {
1469 left = to - con->in_base_pos;
1470 ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
1471 (con->in_base_pos - sizeof(m->hdr)),
1472 left);
1473 if (ret <= 0)
1474 return ret;
1475 con->in_base_pos += ret;
1476 }
1477 dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
1478 m, front_len, m->footer.front_crc, middle_len,
1479 m->footer.middle_crc, data_len, m->footer.data_crc);
1480
1481 /* crc ok? */
1482 if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
1483 pr_err("read_partial_message %p front crc %u != exp. %u\n",
1484 m, con->in_front_crc, m->footer.front_crc);
1485 return -EBADMSG;
1486 }
1487 if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
1488 pr_err("read_partial_message %p middle crc %u != exp %u\n",
1489 m, con->in_middle_crc, m->footer.middle_crc);
1490 return -EBADMSG;
1491 }
1492 if (datacrc &&
1493 (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
1494 con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
1495 pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
1496 con->in_data_crc, le32_to_cpu(m->footer.data_crc));
1497 return -EBADMSG;
1498 }
1499
1500 return 1; /* done! */
1501}
1502
1503/*
1504 * Process message. This happens in the worker thread. The callback should
1505 * be careful not to do anything that waits on other incoming messages or it
1506 * may deadlock.
1507 */
1508static void process_message(struct ceph_connection *con)
1509{
1510 struct ceph_msg *msg;
1511
1512 msg = con->in_msg;
1513 con->in_msg = NULL;
1514
1515 /* if first message, set peer_name */
1516 if (con->peer_name.type == 0)
1517 con->peer_name = msg->hdr.src.name;
1518
1519 con->in_seq++;
1520 mutex_unlock(&con->mutex);
1521
1522 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
1523 msg, le64_to_cpu(msg->hdr.seq),
1524 ENTITY_NAME(msg->hdr.src.name),
1525 le16_to_cpu(msg->hdr.type),
1526 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1527 le32_to_cpu(msg->hdr.front_len),
1528 le32_to_cpu(msg->hdr.data_len),
1529 con->in_front_crc, con->in_middle_crc, con->in_data_crc);
1530 con->ops->dispatch(con, msg);
1531
1532 mutex_lock(&con->mutex);
1533 prepare_read_tag(con);
1534}
1535
1536
1537/*
1538 * Write something to the socket. Called in a worker thread when the
1539 * socket appears to be writeable and we have something ready to send.
1540 */
1541static int try_write(struct ceph_connection *con)
1542{
1543 struct ceph_messenger *msgr = con->msgr;
1544 int ret = 1;
1545
1546 dout("try_write start %p state %lu nref %d\n", con, con->state,
1547 atomic_read(&con->nref));
1548
1549 mutex_lock(&con->mutex);
1550more:
1551 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1552
1553 /* open the socket first? */
1554 if (con->sock == NULL) {
1555 /*
1556 * if we were STANDBY and are reconnecting _this_
1557 * connection, bump connect_seq now. Always bump
1558 * global_seq.
1559 */
1560 if (test_and_clear_bit(STANDBY, &con->state))
1561 con->connect_seq++;
1562
1563 prepare_write_banner(msgr, con);
1564 prepare_write_connect(msgr, con, 1);
1565 prepare_read_banner(con);
1566 set_bit(CONNECTING, &con->state);
1567 clear_bit(NEGOTIATING, &con->state);
1568
1569 BUG_ON(con->in_msg);
1570 con->in_tag = CEPH_MSGR_TAG_READY;
1571 dout("try_write initiating connect on %p new state %lu\n",
1572 con, con->state);
1573 con->sock = ceph_tcp_connect(con);
1574 if (IS_ERR(con->sock)) {
1575 con->sock = NULL;
1576 con->error_msg = "connect error";
1577 ret = -1;
1578 goto out;
1579 }
1580 }
1581
1582more_kvec:
1583 /* kvec data queued? */
1584 if (con->out_skip) {
1585 ret = write_partial_skip(con);
1586 if (ret <= 0)
1587 goto done;
1588 if (ret < 0) {
1589 dout("try_write write_partial_skip err %d\n", ret);
1590 goto done;
1591 }
1592 }
1593 if (con->out_kvec_left) {
1594 ret = write_partial_kvec(con);
1595 if (ret <= 0)
1596 goto done;
1597 }
1598
1599 /* msg pages? */
1600 if (con->out_msg) {
1601 if (con->out_msg_done) {
1602 ceph_msg_put(con->out_msg);
1603 con->out_msg = NULL; /* we're done with this one */
1604 goto do_next;
1605 }
1606
1607 ret = write_partial_msg_pages(con);
1608 if (ret == 1)
1609 goto more_kvec; /* we need to send the footer, too! */
1610 if (ret == 0)
1611 goto done;
1612 if (ret < 0) {
1613 dout("try_write write_partial_msg_pages err %d\n",
1614 ret);
1615 goto done;
1616 }
1617 }
1618
1619do_next:
1620 if (!test_bit(CONNECTING, &con->state)) {
1621 /* is anything else pending? */
1622 if (!list_empty(&con->out_queue)) {
1623 prepare_write_message(con);
1624 goto more;
1625 }
1626 if (con->in_seq > con->in_seq_acked) {
1627 prepare_write_ack(con);
1628 goto more;
1629 }
1630 if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
1631 prepare_write_keepalive(con);
1632 goto more;
1633 }
1634 }
1635
1636 /* Nothing to do! */
1637 clear_bit(WRITE_PENDING, &con->state);
1638 dout("try_write nothing else to write.\n");
1639done:
1640 ret = 0;
1641out:
1642 mutex_unlock(&con->mutex);
1643 dout("try_write done on %p\n", con);
1644 return ret;
1645}
1646
1647
1648
1649/*
1650 * Read what we can from the socket.
1651 */
1652static int try_read(struct ceph_connection *con)
1653{
1654 struct ceph_messenger *msgr;
1655 int ret = -1;
1656
1657 if (!con->sock)
1658 return 0;
1659
1660 if (test_bit(STANDBY, &con->state))
1661 return 0;
1662
1663 dout("try_read start on %p\n", con);
1664 msgr = con->msgr;
1665
1666 mutex_lock(&con->mutex);
1667
1668more:
1669 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
1670 con->in_base_pos);
1671 if (test_bit(CONNECTING, &con->state)) {
1672 if (!test_bit(NEGOTIATING, &con->state)) {
1673 dout("try_read connecting\n");
1674 ret = read_partial_banner(con);
1675 if (ret <= 0)
1676 goto done;
1677 if (process_banner(con) < 0) {
1678 ret = -1;
1679 goto out;
1680 }
1681 }
1682 ret = read_partial_connect(con);
1683 if (ret <= 0)
1684 goto done;
1685 if (process_connect(con) < 0) {
1686 ret = -1;
1687 goto out;
1688 }
1689 goto more;
1690 }
1691
1692 if (con->in_base_pos < 0) {
1693 /*
1694 * skipping + discarding content.
1695 *
1696 * FIXME: there must be a better way to do this!
1697 */
1698 static char buf[1024];
1699 int skip = min(1024, -con->in_base_pos);
1700 dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
1701 ret = ceph_tcp_recvmsg(con->sock, buf, skip);
1702 if (ret <= 0)
1703 goto done;
1704 con->in_base_pos += ret;
1705 if (con->in_base_pos)
1706 goto more;
1707 }
1708 if (con->in_tag == CEPH_MSGR_TAG_READY) {
1709 /*
1710 * what's next?
1711 */
1712 ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
1713 if (ret <= 0)
1714 goto done;
1715 dout("try_read got tag %d\n", (int)con->in_tag);
1716 switch (con->in_tag) {
1717 case CEPH_MSGR_TAG_MSG:
1718 prepare_read_message(con);
1719 break;
1720 case CEPH_MSGR_TAG_ACK:
1721 prepare_read_ack(con);
1722 break;
1723 case CEPH_MSGR_TAG_CLOSE:
1724 set_bit(CLOSED, &con->state); /* fixme */
1725 goto done;
1726 default:
1727 goto bad_tag;
1728 }
1729 }
1730 if (con->in_tag == CEPH_MSGR_TAG_MSG) {
1731 ret = read_partial_message(con);
1732 if (ret <= 0) {
1733 switch (ret) {
1734 case -EBADMSG:
1735 con->error_msg = "bad crc";
1736 ret = -EIO;
1737 goto out;
1738 case -EIO:
1739 con->error_msg = "io error";
1740 goto out;
1741 default:
1742 goto done;
1743 }
1744 }
1745 if (con->in_tag == CEPH_MSGR_TAG_READY)
1746 goto more;
1747 process_message(con);
1748 goto more;
1749 }
1750 if (con->in_tag == CEPH_MSGR_TAG_ACK) {
1751 ret = read_partial_ack(con);
1752 if (ret <= 0)
1753 goto done;
1754 process_ack(con);
1755 goto more;
1756 }
1757
1758done:
1759 ret = 0;
1760out:
1761 mutex_unlock(&con->mutex);
1762 dout("try_read done on %p\n", con);
1763 return ret;
1764
1765bad_tag:
1766 pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
1767 con->error_msg = "protocol error, garbage tag";
1768 ret = -1;
1769 goto out;
1770}
1771
1772
1773/*
1774 * Atomically queue work on a connection. Bump @con reference to
1775 * avoid races with connection teardown.
1776 *
1777 * There is some trickery going on with QUEUED and BUSY because we
1778 * only want a _single_ thread operating on each connection at any
1779 * point in time, but we want to use all available CPUs.
1780 *
1781 * The worker thread only proceeds if it can atomically set BUSY. It
1782 * clears QUEUED and does its thing. When it thinks it's done, it
1783 * clears BUSY, then rechecks QUEUED.. if it's set again, it loops
1784 * (tries again to set BUSY).
1785 *
1786 * To queue work, we first set QUEUED. If BUSY is already set, or if
1787 * queue_work() reports the work already queued, we give up (the work
1788 * is, or soon will be, handled by another thread) but leave QUEUED
1789 * set so that the worker thread will loop again if necessary.
1790 */
1791static void queue_con(struct ceph_connection *con)
1792{
1793 if (test_bit(DEAD, &con->state)) {
1794 dout("queue_con %p ignoring: DEAD\n",
1795 con);
1796 return;
1797 }
1798
1799 if (!con->ops->get(con)) {
1800 dout("queue_con %p ref count 0\n", con);
1801 return;
1802 }
1803
1804 set_bit(QUEUED, &con->state);
1805 if (test_bit(BUSY, &con->state)) {
1806 dout("queue_con %p - already BUSY\n", con);
1807 con->ops->put(con);
1808 } else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
1809 dout("queue_con %p - already queued\n", con);
1810 con->ops->put(con);
1811 } else {
1812 dout("queue_con %p\n", con);
1813 }
1814}
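/*
 * Editorial sketch (not in the original patch): the QUEUED/BUSY
 * handshake reduced to its skeleton, mirroring con_work() below.
 * do_work() is hypothetical.
 */
#if 0
static void worker_skeleton(struct ceph_connection *con)
{
more:
	if (test_and_set_bit(BUSY, &con->state))
		return;			/* another thread owns the connection */
	clear_bit(QUEUED, &con->state);
	do_work(con);			/* hypothetical payload */
	clear_bit(BUSY, &con->state);
	if (test_bit(QUEUED, &con->state))
		goto more;		/* re-queued while we were busy */
}
#endif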
1815
1816/*
1817 * Do some work on a connection. Drop a connection ref when we're done.
1818 */
1819static void con_work(struct work_struct *work)
1820{
1821 struct ceph_connection *con = container_of(work, struct ceph_connection,
1822 work.work);
1823 int backoff = 0;
1824
1825more:
1826 if (test_and_set_bit(BUSY, &con->state) != 0) {
1827 dout("con_work %p BUSY already set\n", con);
1828 goto out;
1829 }
1830 dout("con_work %p start, clearing QUEUED\n", con);
1831 clear_bit(QUEUED, &con->state);
1832
1833 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
1834 dout("con_work CLOSED\n");
1835 con_close_socket(con);
1836 goto done;
1837 }
1838 if (test_and_clear_bit(OPENING, &con->state)) {
1839 /* reopen w/ new peer */
1840 dout("con_work OPENING\n");
1841 con_close_socket(con);
1842 }
1843
1844 if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
1845 try_read(con) < 0 ||
1846 try_write(con) < 0) {
1847 backoff = 1;
1848 ceph_fault(con); /* error/fault path */
1849 }
1850
1851done:
1852 clear_bit(BUSY, &con->state);
1853 dout("con->state=%lu\n", con->state);
1854 if (test_bit(QUEUED, &con->state)) {
1855 if (!backoff || test_bit(OPENING, &con->state)) {
1856 dout("con_work %p QUEUED reset, looping\n", con);
1857 goto more;
1858 }
1859 dout("con_work %p QUEUED reset, but just faulted\n", con);
1860 clear_bit(QUEUED, &con->state);
1861 }
1862 dout("con_work %p done\n", con);
1863
1864out:
1865 con->ops->put(con);
1866}
1867
1868
1869/*
1870 * Generic error/fault handler. A retry mechanism is used with
1871 * exponential backoff
1872 */
1873static void ceph_fault(struct ceph_connection *con)
1874{
1875 pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
1876 pr_addr(&con->peer_addr.in_addr), con->error_msg);
1877 dout("fault %p state %lu to peer %s\n",
1878 con, con->state, pr_addr(&con->peer_addr.in_addr));
1879
1880 if (test_bit(LOSSYTX, &con->state)) {
1881 dout("fault on LOSSYTX channel\n");
1882 goto out;
1883 }
1884
1885 mutex_lock(&con->mutex);
1886 if (test_bit(CLOSED, &con->state))
1887 goto out_unlock;
1888
1889 con_close_socket(con);
1890
1891 if (con->in_msg) {
1892 ceph_msg_put(con->in_msg);
1893 con->in_msg = NULL;
1894 }
1895
1896 /* Requeue anything that hasn't been acked */
1897 list_splice_init(&con->out_sent, &con->out_queue);
1898
1899 /* If there are no messages in the queue, place the connection
1900 * in a STANDBY state (i.e., don't try to reconnect just yet). */
1901 if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
1902 dout("fault setting STANDBY\n");
1903 set_bit(STANDBY, &con->state);
1904 } else {
1905 /* retry after a delay. */
1906 if (con->delay == 0)
1907 con->delay = BASE_DELAY_INTERVAL;
1908 else if (con->delay < MAX_DELAY_INTERVAL)
1909 con->delay *= 2;
1910 dout("fault queueing %p delay %lu\n", con, con->delay);
1911 con->ops->get(con);
1912 if (queue_delayed_work(ceph_msgr_wq, &con->work,
1913 round_jiffies_relative(con->delay)) == 0)
1914 con->ops->put(con);
1915 }
1916
1917out_unlock:
1918 mutex_unlock(&con->mutex);
1919out:
1920 /*
1921 * in case we faulted due to authentication, invalidate our
1922 * current tickets so that we can get new ones.
1923 */
1924 if (con->auth_retry && con->ops->invalidate_authorizer) {
1925 dout("calling invalidate_authorizer()\n");
1926 con->ops->invalidate_authorizer(con);
1927 }
1928
1929 if (con->ops->fault)
1930 con->ops->fault(con);
1931}
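/*
 * Editorial sketch (not in the original patch): the retry delay
 * produced by the doubling above, factored into a helper.  Successive
 * faults yield BASE_DELAY_INTERVAL, then 2x, 4x, ... until
 * MAX_DELAY_INTERVAL stops the growth.
 */
#if 0
static unsigned long next_fault_delay(unsigned long delay)
{
	if (delay == 0)
		return BASE_DELAY_INTERVAL;
	if (delay < MAX_DELAY_INTERVAL)
		return delay * 2;
	return delay;			/* effectively capped */
}
#endif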
1932
1933
1934
1935/*
1936 * create a new messenger instance
1937 */
1938struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1939{
1940 struct ceph_messenger *msgr;
1941
1942 msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
1943 if (msgr == NULL)
1944 return ERR_PTR(-ENOMEM);
1945
1946 spin_lock_init(&msgr->global_seq_lock);
1947
1948 /* the zero page is needed if a request is "canceled" while the message
1949 * is being written over the socket */
1950 msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
1951 if (!msgr->zero_page) {
1952 kfree(msgr);
1953 return ERR_PTR(-ENOMEM);
1954 }
1955 kmap(msgr->zero_page);
1956
1957 if (myaddr)
1958 msgr->inst.addr = *myaddr;
1959
1960 /* select a random nonce */
1961 msgr->inst.addr.type = 0;
1962 get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
1963 encode_my_addr(msgr);
1964
1965 dout("messenger_create %p\n", msgr);
1966 return msgr;
1967}
1968
1969void ceph_messenger_destroy(struct ceph_messenger *msgr)
1970{
1971 dout("destroy %p\n", msgr);
1972 kunmap(msgr->zero_page);
1973 __free_page(msgr->zero_page);
1974 kfree(msgr);
1975 dout("destroyed messenger %p\n", msgr);
1976}
1977
1978/*
1979 * Queue up an outgoing message on the given connection.
1980 */
1981void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
1982{
1983 if (test_bit(CLOSED, &con->state)) {
1984 dout("con_send %p closed, dropping %p\n", con, msg);
1985 ceph_msg_put(msg);
1986 return;
1987 }
1988
1989 /* set src+dst */
1990 msg->hdr.src.name = con->msgr->inst.name;
1991 msg->hdr.src.addr = con->msgr->my_enc_addr;
1992 msg->hdr.orig_src = msg->hdr.src;
1993
1994 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
1995
1996 msg->needs_out_seq = true;
1997
1998 /* queue */
1999 mutex_lock(&con->mutex);
2000 BUG_ON(!list_empty(&msg->list_head));
2001 list_add_tail(&msg->list_head, &con->out_queue);
2002 dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
2003 ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
2004 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
2005 le32_to_cpu(msg->hdr.front_len),
2006 le32_to_cpu(msg->hdr.middle_len),
2007 le32_to_cpu(msg->hdr.data_len));
2008 mutex_unlock(&con->mutex);
2009
2010 /* if there wasn't anything waiting to send before, queue
2011 * new work */
2012 if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2013 queue_con(con);
2014}
2015
2016/*
2017 * Revoke a message that was previously queued for send
2018 */
2019void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
2020{
2021 mutex_lock(&con->mutex);
2022 if (!list_empty(&msg->list_head)) {
2023 dout("con_revoke %p msg %p\n", con, msg);
2024 list_del_init(&msg->list_head);
2025 ceph_msg_put(msg);
2026 msg->hdr.seq = 0;
2027 if (con->out_msg == msg) {
2028 ceph_msg_put(con->out_msg);
2029 con->out_msg = NULL;
2030 }
2031 if (con->out_kvec_is_msg) {
2032 con->out_skip = con->out_kvec_bytes;
2033 con->out_kvec_is_msg = false;
2034 }
2035 } else {
2036 dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg);
2037 }
2038 mutex_unlock(&con->mutex);
2039}
2040
2041/*
2042 * Revoke a message that we may be reading data into
2043 */
2044void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
2045{
2046 mutex_lock(&con->mutex);
2047 if (con->in_msg && con->in_msg == msg) {
2048 unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
2049 unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
2050 unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
2051
2052 /* skip rest of message */
2053 dout("con_revoke_pages %p msg %p revoked\n", con, msg);
2054 con->in_base_pos = con->in_base_pos -
2055 sizeof(struct ceph_msg_header) -
2056 front_len -
2057 middle_len -
2058 data_len -
2059 sizeof(struct ceph_msg_footer);
2060 ceph_msg_put(con->in_msg);
2061 con->in_msg = NULL;
2062 con->in_tag = CEPH_MSGR_TAG_READY;
2063 con->in_seq++;
2064 } else {
2065 dout("con_revoke_pages %p msg %p pages %p no-op\n",
2066 con, con->in_msg, msg);
2067 }
2068 mutex_unlock(&con->mutex);
2069}
2070
2071/*
2072 * Queue a keepalive byte to ensure the tcp connection is alive.
2073 */
2074void ceph_con_keepalive(struct ceph_connection *con)
2075{
2076 if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
2077 test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2078 queue_con(con);
2079}
2080
2081
2082/*
2083 * construct a new message with given type, size
2084 * the new msg has a ref count of 1.
2085 */
2086struct ceph_msg *ceph_msg_new(int type, int front_len,
2087 int page_len, int page_off, struct page **pages)
2088{
2089 struct ceph_msg *m;
2090
2091 m = kmalloc(sizeof(*m), GFP_NOFS);
2092 if (m == NULL)
2093 goto out;
2094 kref_init(&m->kref);
2095 INIT_LIST_HEAD(&m->list_head);
2096
2097 m->hdr.tid = 0;
2098 m->hdr.type = cpu_to_le16(type);
2099 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
2100 m->hdr.version = 0;
2101 m->hdr.front_len = cpu_to_le32(front_len);
2102 m->hdr.middle_len = 0;
2103 m->hdr.data_len = cpu_to_le32(page_len);
2104 m->hdr.data_off = cpu_to_le16(page_off);
2105 m->hdr.reserved = 0;
2106 m->footer.front_crc = 0;
2107 m->footer.middle_crc = 0;
2108 m->footer.data_crc = 0;
2109 m->footer.flags = 0;
2110 m->front_max = front_len;
2111 m->front_is_vmalloc = false;
2112 m->more_to_follow = false;
2113 m->pool = NULL;
2114
2115 /* front */
2116 if (front_len) {
2117 if (front_len > PAGE_CACHE_SIZE) {
2118 m->front.iov_base = __vmalloc(front_len, GFP_NOFS,
2119 PAGE_KERNEL);
2120 m->front_is_vmalloc = true;
2121 } else {
2122 m->front.iov_base = kmalloc(front_len, GFP_NOFS);
2123 }
2124 if (m->front.iov_base == NULL) {
2125 pr_err("msg_new can't allocate %d bytes\n",
2126 front_len);
2127 goto out2;
2128 }
2129 } else {
2130 m->front.iov_base = NULL;
2131 }
2132 m->front.iov_len = front_len;
2133
2134 /* middle */
2135 m->middle = NULL;
2136
2137 /* data */
2138 m->nr_pages = calc_pages_for(page_off, page_len);
2139 m->pages = pages;
2140 m->pagelist = NULL;
2141
2142 dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len,
2143 m->nr_pages);
2144 return m;
2145
2146out2:
2147 ceph_msg_put(m);
2148out:
2149 pr_err("msg_new can't create type %d len %d\n", type, front_len);
2150 return ERR_PTR(-ENOMEM);
2151}
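
/*
 * Illustrative usage sketch (not from this commit): allocate a message
 * with a small front section and queue it on an established connection.
 * ceph_con_send() consumes the caller's reference -- note the explicit
 * ceph_msg_get() ("keep our ref") in mon_client.c when a caller wants
 * to keep the message around after sending.
 */
static void __maybe_unused example_send(struct ceph_connection *con, int type)
{
	struct ceph_msg *msg = ceph_msg_new(type, 16, 0, 0, NULL);

	if (IS_ERR(msg))
		return;
	memset(msg->front.iov_base, 0, msg->front.iov_len);
	ceph_con_send(con, msg);	/* the connection now owns our ref */
}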
2152
2153/*
2154 * Allocate "middle" portion of a message, if it is needed and wasn't
2155 * allocated by alloc_msg. This allows us to read a small fixed-size
2156 * per-type header in the front and then gracefully fail (i.e.,
2157 * propagate the error to the caller based on info in the front) when
2158 * the middle is too large.
2159 */
2160static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
2161{
2162 int type = le16_to_cpu(msg->hdr.type);
2163 int middle_len = le32_to_cpu(msg->hdr.middle_len);
2164
2165 dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
2166 ceph_msg_type_name(type), middle_len);
2167 BUG_ON(!middle_len);
2168 BUG_ON(msg->middle);
2169
2170 msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
2171 if (!msg->middle)
2172 return -ENOMEM;
2173 return 0;
2174}
2175
2176/*
2177 * Generic message allocator, for incoming messages.
2178 */
2179static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2180 struct ceph_msg_header *hdr,
2181 int *skip)
2182{
2183 int type = le16_to_cpu(hdr->type);
2184 int front_len = le32_to_cpu(hdr->front_len);
2185 int middle_len = le32_to_cpu(hdr->middle_len);
2186 struct ceph_msg *msg = NULL;
2187 int ret;
2188
2189 if (con->ops->alloc_msg) {
2190 mutex_unlock(&con->mutex);
2191 msg = con->ops->alloc_msg(con, hdr, skip);
2192 mutex_lock(&con->mutex);
2193 if (IS_ERR(msg))
2194 return msg;
2195
2196 if (*skip)
2197 return NULL;
2198 }
2199 if (!msg) {
2200 *skip = 0;
2201 msg = ceph_msg_new(type, front_len, 0, 0, NULL);
2202 if (!msg) {
2203 pr_err("unable to allocate msg type %d len %d\n",
2204 type, front_len);
2205 return ERR_PTR(-ENOMEM);
2206 }
2207 }
2208 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2209
2210 if (middle_len) {
2211 ret = ceph_alloc_middle(con, msg);
2212
2213 if (ret < 0) {
2214 ceph_msg_put(msg);
2215			return ERR_PTR(ret);	/* msg was just put; don't return it */
2216 }
2217 }
2218
2219 return msg;
2220}
2221
2222
2223/*
2224 * Free a generically kmalloc'd message.
2225 */
2226void ceph_msg_kfree(struct ceph_msg *m)
2227{
2228 dout("msg_kfree %p\n", m);
2229 if (m->front_is_vmalloc)
2230 vfree(m->front.iov_base);
2231 else
2232 kfree(m->front.iov_base);
2233 kfree(m);
2234}
2235
2236/*
2237 * Drop a msg ref. Destroy as needed.
2238 */
2239void ceph_msg_last_put(struct kref *kref)
2240{
2241 struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
2242
2243 dout("ceph_msg_put last one on %p\n", m);
2244 WARN_ON(!list_empty(&m->list_head));
2245
2246 /* drop middle, data, if any */
2247 if (m->middle) {
2248 ceph_buffer_put(m->middle);
2249 m->middle = NULL;
2250 }
2251 m->nr_pages = 0;
2252 m->pages = NULL;
2253
2254 if (m->pagelist) {
2255 ceph_pagelist_release(m->pagelist);
2256 kfree(m->pagelist);
2257 m->pagelist = NULL;
2258 }
2259
2260 if (m->pool)
2261 ceph_msgpool_put(m->pool, m);
2262 else
2263 ceph_msg_kfree(m);
2264}
2265
2266void ceph_msg_dump(struct ceph_msg *msg)
2267{
2268 pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
2269 msg->front_max, msg->nr_pages);
2270 print_hex_dump(KERN_DEBUG, "header: ",
2271 DUMP_PREFIX_OFFSET, 16, 1,
2272 &msg->hdr, sizeof(msg->hdr), true);
2273 print_hex_dump(KERN_DEBUG, " front: ",
2274 DUMP_PREFIX_OFFSET, 16, 1,
2275 msg->front.iov_base, msg->front.iov_len, true);
2276 if (msg->middle)
2277 print_hex_dump(KERN_DEBUG, "middle: ",
2278 DUMP_PREFIX_OFFSET, 16, 1,
2279 msg->middle->vec.iov_base,
2280 msg->middle->vec.iov_len, true);
2281 print_hex_dump(KERN_DEBUG, "footer: ",
2282 DUMP_PREFIX_OFFSET, 16, 1,
2283 &msg->footer, sizeof(msg->footer), true);
2284}
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
new file mode 100644
index 000000000000..a5caf91cc971
--- /dev/null
+++ b/fs/ceph/messenger.h
@@ -0,0 +1,256 @@
1#ifndef __FS_CEPH_MESSENGER_H
2#define __FS_CEPH_MESSENGER_H
3
4#include <linux/kref.h>
5#include <linux/mutex.h>
6#include <linux/net.h>
7#include <linux/radix-tree.h>
8#include <linux/uio.h>
9#include <linux/version.h>
10#include <linux/workqueue.h>
11
12#include "types.h"
13#include "buffer.h"
14
15struct ceph_msg;
16struct ceph_connection;
17
18extern struct workqueue_struct *ceph_msgr_wq; /* receive work queue */
19
20/*
21 * Ceph defines these callbacks for handling connection events.
22 */
23struct ceph_connection_operations {
24 struct ceph_connection *(*get)(struct ceph_connection *);
25 void (*put)(struct ceph_connection *);
26
27 /* handle an incoming message. */
28 void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
29
30 /* authorize an outgoing connection */
31 int (*get_authorizer) (struct ceph_connection *con,
32 void **buf, int *len, int *proto,
33 void **reply_buf, int *reply_len, int force_new);
34 int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
35 int (*invalidate_authorizer)(struct ceph_connection *con);
36
37 /* protocol version mismatch */
38 void (*bad_proto) (struct ceph_connection *con);
39
40 /* there was some error on the socket (disconnect, whatever) */
41 void (*fault) (struct ceph_connection *con);
42
 43	/* a remote host has terminated a message exchange session, and messages
44 * we sent (or they tried to send us) may be lost. */
45 void (*peer_reset) (struct ceph_connection *con);
46
47 struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
48 struct ceph_msg_header *hdr,
49 int *skip);
50};
51
52extern const char *ceph_name_type_str(int t);
53
54/* use format string %s%d */
55#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num)
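/* usage example: dout("to %s%lld\n", ENTITY_NAME(con->peer_name)); */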
56
57struct ceph_messenger {
58 struct ceph_entity_inst inst; /* my name+address */
59 struct ceph_entity_addr my_enc_addr;
60 struct page *zero_page; /* used in certain error cases */
61
62 bool nocrc;
63
64 /*
 65	 * the global_seq counts connections we (attempt to) initiate
66 * in order to disambiguate certain connect race conditions.
67 */
68 u32 global_seq;
69 spinlock_t global_seq_lock;
70};
71
72/*
73 * a single message. it contains a header (src, dest, message type, etc.),
74 * footer (crc values, mainly), a "front" message body, and possibly a
75 * data payload (stored in some number of pages).
76 */
77struct ceph_msg {
78 struct ceph_msg_header hdr; /* header */
79 struct ceph_msg_footer footer; /* footer */
80 struct kvec front; /* unaligned blobs of message */
81 struct ceph_buffer *middle;
82 struct page **pages; /* data payload. NOT OWNER. */
83 unsigned nr_pages; /* size of page array */
84 struct ceph_pagelist *pagelist; /* instead of pages */
85 struct list_head list_head;
86 struct kref kref;
87 bool front_is_vmalloc;
88 bool more_to_follow;
89 bool needs_out_seq;
90 int front_max;
91
92 struct ceph_msgpool *pool;
93};
94
95struct ceph_msg_pos {
96 int page, page_pos; /* which page; offset in page */
97 int data_pos; /* offset in data payload */
98 int did_page_crc; /* true if we've calculated crc for current page */
99};
100
101/* ceph connection fault delay defaults, for exponential backoff */
102#define BASE_DELAY_INTERVAL (HZ/2)
103#define MAX_DELAY_INTERVAL (5 * 60 * HZ)
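
/*
 * A sketch of the backoff step these bounds imply (the actual logic
 * lives in the messenger's fault path): double the delay after each
 * failure, clamped to the maximum.
 */
static inline unsigned long ceph_backoff_delay(unsigned long delay)
{
	delay = delay ? delay * 2 : BASE_DELAY_INTERVAL;
	return min(delay, (unsigned long)MAX_DELAY_INTERVAL);
}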
104
105/*
106 * ceph_connection state bit flags
107 *
108 * QUEUED and BUSY are used together to ensure that only a single
109 * thread is currently opening, reading or writing data to the socket.
110 */
111#define LOSSYTX 0 /* we can close channel or drop messages on errors */
112#define CONNECTING 1
113#define NEGOTIATING 2
114#define KEEPALIVE_PENDING 3
115#define WRITE_PENDING 4 /* we have data ready to send */
116#define QUEUED 5 /* there is work queued on this connection */
117#define BUSY 6 /* work is being done */
118#define STANDBY 8 /* no outgoing messages, socket closed. we keep
119 * the ceph_connection around to maintain shared
120 * state with the peer. */
121#define CLOSED 10 /* we've closed the connection */
122#define SOCK_CLOSED 11 /* socket state changed to closed */
123#define OPENING 13 /* open connection w/ (possibly new) peer */
124#define DEAD 14 /* dead, about to kfree */
125
126/*
127 * A single connection with another host.
128 *
129 * We maintain a queue of outgoing messages, and some session state to
130 * ensure that we can preserve the lossless, ordered delivery of
131 * messages in the case of a TCP disconnect.
132 */
133struct ceph_connection {
134 void *private;
135 atomic_t nref;
136
137 const struct ceph_connection_operations *ops;
138
139 struct ceph_messenger *msgr;
140 struct socket *sock;
141 unsigned long state; /* connection state (see flags above) */
142 const char *error_msg; /* error message, if any */
143
144 struct ceph_entity_addr peer_addr; /* peer address */
145 struct ceph_entity_name peer_name; /* peer name */
146 struct ceph_entity_addr peer_addr_for_me;
147	u32 connect_seq;      /* identify the most recent connection
148				 attempt for this session (client-side counter) */
149 u32 peer_global_seq; /* peer's global seq for this connection */
150
151 int auth_retry; /* true if we need a newer authorizer */
152 void *auth_reply_buf; /* where to put the authorizer reply */
153 int auth_reply_buf_len;
154
155 struct mutex mutex;
156
157 /* out queue */
158 struct list_head out_queue;
159 struct list_head out_sent; /* sending or sent but unacked */
160 u64 out_seq; /* last message queued for send */
161 u64 out_seq_sent; /* last message sent */
162 bool out_keepalive_pending;
163
164 u64 in_seq, in_seq_acked; /* last message received, acked */
165
166 /* connection negotiation temps */
167 char in_banner[CEPH_BANNER_MAX_LEN];
168 union {
169 struct { /* outgoing connection */
170 struct ceph_msg_connect out_connect;
171 struct ceph_msg_connect_reply in_reply;
172 };
173 struct { /* incoming */
174 struct ceph_msg_connect in_connect;
175 struct ceph_msg_connect_reply out_reply;
176 };
177 };
178 struct ceph_entity_addr actual_peer_addr;
179
180 /* message out temps */
181 struct ceph_msg *out_msg; /* sending message (== tail of
182 out_sent) */
183 bool out_msg_done;
184 struct ceph_msg_pos out_msg_pos;
185
186 struct kvec out_kvec[8], /* sending header/footer data */
187 *out_kvec_cur;
188 int out_kvec_left; /* kvec's left in out_kvec */
189 int out_skip; /* skip this many bytes */
190 int out_kvec_bytes; /* total bytes left */
191 bool out_kvec_is_msg; /* kvec refers to out_msg */
192 int out_more; /* there is more data after the kvecs */
193 __le64 out_temp_ack; /* for writing an ack */
194
195 /* message in temps */
196 struct ceph_msg_header in_hdr;
197 struct ceph_msg *in_msg;
198 struct ceph_msg_pos in_msg_pos;
199 u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
200
201 char in_tag; /* protocol control byte */
202 int in_base_pos; /* bytes read */
203 __le64 in_temp_ack; /* for reading an ack */
204
205 struct delayed_work work; /* send|recv work */
206 unsigned long delay; /* current delay interval */
207};
208
209
210extern const char *pr_addr(const struct sockaddr_storage *ss);
211extern int ceph_parse_ips(const char *c, const char *end,
212 struct ceph_entity_addr *addr,
213 int max_count, int *count);
214
215
216extern int ceph_msgr_init(void);
217extern void ceph_msgr_exit(void);
218
219extern struct ceph_messenger *ceph_messenger_create(
220 struct ceph_entity_addr *myaddr);
221extern void ceph_messenger_destroy(struct ceph_messenger *);
222
223extern void ceph_con_init(struct ceph_messenger *msgr,
224 struct ceph_connection *con);
225extern void ceph_con_open(struct ceph_connection *con,
226 struct ceph_entity_addr *addr);
227extern bool ceph_con_opened(struct ceph_connection *con);
228extern void ceph_con_close(struct ceph_connection *con);
229extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
230extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
231extern void ceph_con_revoke_message(struct ceph_connection *con,
232 struct ceph_msg *msg);
233extern void ceph_con_keepalive(struct ceph_connection *con);
234extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
235extern void ceph_con_put(struct ceph_connection *con);
236
237extern struct ceph_msg *ceph_msg_new(int type, int front_len,
238 int page_len, int page_off,
239 struct page **pages);
240extern void ceph_msg_kfree(struct ceph_msg *m);
241
242
243static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
244{
245 kref_get(&msg->kref);
246 return msg;
247}
248extern void ceph_msg_last_put(struct kref *kref);
249static inline void ceph_msg_put(struct ceph_msg *msg)
250{
251 kref_put(&msg->kref, ceph_msg_last_put);
252}
253
254extern void ceph_msg_dump(struct ceph_msg *msg);
255
256#endif
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
new file mode 100644
index 000000000000..8fdc011ca956
--- /dev/null
+++ b/fs/ceph/mon_client.c
@@ -0,0 +1,835 @@
1#include "ceph_debug.h"
2
3#include <linux/types.h>
4#include <linux/slab.h>
5#include <linux/random.h>
6#include <linux/sched.h>
7
8#include "mon_client.h"
9#include "super.h"
10#include "auth.h"
11#include "decode.h"
12
13/*
14 * Interact with Ceph monitor cluster. Handle requests for new map
15 * versions, and periodically resend as needed. Also implement
16 * statfs() and umount().
17 *
 18 * A small cluster of Ceph "monitors" is responsible for managing critical
19 * cluster configuration and state information. An odd number (e.g., 3, 5)
20 * of cmon daemons use a modified version of the Paxos part-time parliament
21 * algorithm to manage the MDS map (mds cluster membership), OSD map, and
22 * list of clients who have mounted the file system.
23 *
24 * We maintain an open, active session with a monitor at all times in order to
25 * receive timely MDSMap updates. We periodically send a keepalive byte on the
26 * TCP socket to ensure we detect a failure. If the connection does break, we
27 * randomly hunt for a new monitor. Once the connection is reestablished, we
28 * resend any outstanding requests.
29 */
30
 31static const struct ceph_connection_operations mon_con_ops;
32
33static int __validate_auth(struct ceph_mon_client *monc);
34
35/*
36 * Decode a monmap blob (e.g., during mount).
37 */
38struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
39{
40 struct ceph_monmap *m = NULL;
41 int i, err = -EINVAL;
42 struct ceph_fsid fsid;
43 u32 epoch, num_mon;
44 u16 version;
45 u32 len;
46
47 ceph_decode_32_safe(&p, end, len, bad);
48 ceph_decode_need(&p, end, len, bad);
49
50 dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
51
52 ceph_decode_16_safe(&p, end, version, bad);
53
54 ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
55 ceph_decode_copy(&p, &fsid, sizeof(fsid));
56 epoch = ceph_decode_32(&p);
57
58 num_mon = ceph_decode_32(&p);
59 ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
60
61 if (num_mon >= CEPH_MAX_MON)
62 goto bad;
63 m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
64 if (m == NULL)
65 return ERR_PTR(-ENOMEM);
66 m->fsid = fsid;
67 m->epoch = epoch;
68 m->num_mon = num_mon;
69 ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
70 for (i = 0; i < num_mon; i++)
71 ceph_decode_addr(&m->mon_inst[i].addr);
72
73 dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
74 m->num_mon);
75 for (i = 0; i < m->num_mon; i++)
76 dout("monmap_decode mon%d is %s\n", i,
77 pr_addr(&m->mon_inst[i].addr.in_addr));
78 return m;
79
80bad:
81 dout("monmap_decode failed with %d\n", err);
82 kfree(m);
83 return ERR_PTR(err);
84}
85
86/*
87 * return true if *addr is included in the monmap.
88 */
89int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
90{
91 int i;
92
93 for (i = 0; i < m->num_mon; i++)
94 if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
95 return 1;
96 return 0;
97}
98
99/*
100 * Send an auth request.
101 */
102static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
103{
104 monc->pending_auth = 1;
105 monc->m_auth->front.iov_len = len;
106 monc->m_auth->hdr.front_len = cpu_to_le32(len);
107 ceph_msg_get(monc->m_auth); /* keep our ref */
108 ceph_con_send(monc->con, monc->m_auth);
109}
110
111/*
112 * Close monitor session, if any.
113 */
114static void __close_session(struct ceph_mon_client *monc)
115{
116 if (monc->con) {
117 dout("__close_session closing mon%d\n", monc->cur_mon);
118 ceph_con_revoke(monc->con, monc->m_auth);
119 ceph_con_close(monc->con);
120 monc->cur_mon = -1;
121 monc->pending_auth = 0;
122 ceph_auth_reset(monc->auth);
123 }
124}
125
126/*
127 * Open a session with a (new) monitor.
128 */
129static int __open_session(struct ceph_mon_client *monc)
130{
131	u8 r;	/* unsigned, so the modulo below can't go negative */
132 int ret;
133
134 if (monc->cur_mon < 0) {
135 get_random_bytes(&r, 1);
136 monc->cur_mon = r % monc->monmap->num_mon;
137 dout("open_session num=%d r=%d -> mon%d\n",
138 monc->monmap->num_mon, r, monc->cur_mon);
139 monc->sub_sent = 0;
140 monc->sub_renew_after = jiffies; /* i.e., expired */
141 monc->want_next_osdmap = !!monc->want_next_osdmap;
142
143 dout("open_session mon%d opening\n", monc->cur_mon);
144 monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
145 monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
146 ceph_con_open(monc->con,
147 &monc->monmap->mon_inst[monc->cur_mon].addr);
148
149		/* initiate authentication handshake */
150 ret = ceph_auth_build_hello(monc->auth,
151 monc->m_auth->front.iov_base,
152 monc->m_auth->front_max);
153 __send_prepared_auth_request(monc, ret);
154 } else {
155 dout("open_session mon%d already open\n", monc->cur_mon);
156 }
157 return 0;
158}
159
160static bool __sub_expired(struct ceph_mon_client *monc)
161{
162 return time_after_eq(jiffies, monc->sub_renew_after);
163}
164
165/*
166 * Reschedule delayed work timer.
167 */
168static void __schedule_delayed(struct ceph_mon_client *monc)
169{
170 unsigned delay;
171
172 if (monc->cur_mon < 0 || __sub_expired(monc))
173 delay = 10 * HZ;
174 else
175 delay = 20 * HZ;
176 dout("__schedule_delayed after %u\n", delay);
177 schedule_delayed_work(&monc->delayed_work, delay);
178}
179
180/*
181 * Send subscribe request for mdsmap and/or osdmap.
182 */
183static void __send_subscribe(struct ceph_mon_client *monc)
184{
185 dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
186 (unsigned)monc->sub_sent, __sub_expired(monc),
187 monc->want_next_osdmap);
188 if ((__sub_expired(monc) && !monc->sub_sent) ||
189 monc->want_next_osdmap == 1) {
190 struct ceph_msg *msg;
191 struct ceph_mon_subscribe_item *i;
192 void *p, *end;
193
194 msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
195 if (!msg)
196 return;
197
198 p = msg->front.iov_base;
199 end = p + msg->front.iov_len;
200
201 dout("__send_subscribe to 'mdsmap' %u+\n",
202 (unsigned)monc->have_mdsmap);
203 if (monc->want_next_osdmap) {
204 dout("__send_subscribe to 'osdmap' %u\n",
205 (unsigned)monc->have_osdmap);
206 ceph_encode_32(&p, 3);
207 ceph_encode_string(&p, end, "osdmap", 6);
208 i = p;
209 i->have = cpu_to_le64(monc->have_osdmap);
210 i->onetime = 1;
211 p += sizeof(*i);
212 monc->want_next_osdmap = 2; /* requested */
213 } else {
214 ceph_encode_32(&p, 2);
215 }
216 ceph_encode_string(&p, end, "mdsmap", 6);
217 i = p;
218 i->have = cpu_to_le64(monc->have_mdsmap);
219 i->onetime = 0;
220 p += sizeof(*i);
221 ceph_encode_string(&p, end, "monmap", 6);
222 i = p;
223 i->have = 0;
224 i->onetime = 0;
225 p += sizeof(*i);
226
227 msg->front.iov_len = p - msg->front.iov_base;
228 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
229 ceph_con_send(monc->con, msg);
230
231 monc->sub_sent = jiffies | 1; /* never 0 */
232 }
233}
234
235static void handle_subscribe_ack(struct ceph_mon_client *monc,
236 struct ceph_msg *msg)
237{
238 unsigned seconds;
239 struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
240
241 if (msg->front.iov_len < sizeof(*h))
242 goto bad;
243 seconds = le32_to_cpu(h->duration);
244
245 mutex_lock(&monc->mutex);
246 if (monc->hunting) {
247 pr_info("mon%d %s session established\n",
248 monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
249 monc->hunting = false;
250 }
251 dout("handle_subscribe_ack after %d seconds\n", seconds);
252 monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
253 monc->sub_sent = 0;
254 mutex_unlock(&monc->mutex);
255 return;
256bad:
257 pr_err("got corrupt subscribe-ack msg\n");
258 ceph_msg_dump(msg);
259}
260
261/*
262 * Keep track of which maps we have
263 */
264int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
265{
266 mutex_lock(&monc->mutex);
267 monc->have_mdsmap = got;
268 mutex_unlock(&monc->mutex);
269 return 0;
270}
271
272int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
273{
274 mutex_lock(&monc->mutex);
275 monc->have_osdmap = got;
276 monc->want_next_osdmap = 0;
277 mutex_unlock(&monc->mutex);
278 return 0;
279}
280
281/*
282 * Register interest in the next osdmap
283 */
284void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
285{
286 dout("request_next_osdmap have %u\n", monc->have_osdmap);
287 mutex_lock(&monc->mutex);
288 if (!monc->want_next_osdmap)
289 monc->want_next_osdmap = 1;
290 if (monc->want_next_osdmap < 2)
291 __send_subscribe(monc);
292 mutex_unlock(&monc->mutex);
293}
294
295/*
296 *
297 */
298int ceph_monc_open_session(struct ceph_mon_client *monc)
299{
300 if (!monc->con) {
301 monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
302 if (!monc->con)
303 return -ENOMEM;
304 ceph_con_init(monc->client->msgr, monc->con);
305 monc->con->private = monc;
306 monc->con->ops = &mon_con_ops;
307 }
308
309 mutex_lock(&monc->mutex);
310 __open_session(monc);
311 __schedule_delayed(monc);
312 mutex_unlock(&monc->mutex);
313 return 0;
314}
315
316/*
317 * The monitor responds with a mount ack to indicate mount success.  The
318 * included client ticket allows the client to talk to MDSs and OSDs.
319 */
320static void ceph_monc_handle_map(struct ceph_mon_client *monc,
321 struct ceph_msg *msg)
322{
323 struct ceph_client *client = monc->client;
324 struct ceph_monmap *monmap = NULL, *old = monc->monmap;
325 void *p, *end;
326
327 mutex_lock(&monc->mutex);
328
329 dout("handle_monmap\n");
330 p = msg->front.iov_base;
331 end = p + msg->front.iov_len;
332
333 monmap = ceph_monmap_decode(p, end);
334 if (IS_ERR(monmap)) {
335 pr_err("problem decoding monmap, %d\n",
336 (int)PTR_ERR(monmap));
337 goto out;
338 }
339
340 if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
341 kfree(monmap);
342 goto out;
343 }
344
345 client->monc.monmap = monmap;
346 kfree(old);
347
348out:
349 mutex_unlock(&monc->mutex);
350 wake_up(&client->auth_wq);
351}
352
353/*
354 * statfs
355 */
356static struct ceph_mon_statfs_request *__lookup_statfs(
357 struct ceph_mon_client *monc, u64 tid)
358{
359 struct ceph_mon_statfs_request *req;
360 struct rb_node *n = monc->statfs_request_tree.rb_node;
361
362 while (n) {
363 req = rb_entry(n, struct ceph_mon_statfs_request, node);
364 if (tid < req->tid)
365 n = n->rb_left;
366 else if (tid > req->tid)
367 n = n->rb_right;
368 else
369 return req;
370 }
371 return NULL;
372}
373
374static void __insert_statfs(struct ceph_mon_client *monc,
375 struct ceph_mon_statfs_request *new)
376{
377 struct rb_node **p = &monc->statfs_request_tree.rb_node;
378 struct rb_node *parent = NULL;
379 struct ceph_mon_statfs_request *req = NULL;
380
381 while (*p) {
382 parent = *p;
383 req = rb_entry(parent, struct ceph_mon_statfs_request, node);
384 if (new->tid < req->tid)
385 p = &(*p)->rb_left;
386 else if (new->tid > req->tid)
387 p = &(*p)->rb_right;
388 else
389 BUG();
390 }
391
392 rb_link_node(&new->node, parent, p);
393 rb_insert_color(&new->node, &monc->statfs_request_tree);
394}
395
396static void handle_statfs_reply(struct ceph_mon_client *monc,
397 struct ceph_msg *msg)
398{
399 struct ceph_mon_statfs_request *req;
400 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
401 u64 tid;
402
403 if (msg->front.iov_len != sizeof(*reply))
404 goto bad;
405 tid = le64_to_cpu(msg->hdr.tid);
406 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
407
408 mutex_lock(&monc->mutex);
409 req = __lookup_statfs(monc, tid);
410 if (req) {
411 *req->buf = reply->st;
412 req->result = 0;
413 }
414 mutex_unlock(&monc->mutex);
415 if (req)
416 complete(&req->completion);
417 return;
418
419bad:
420 pr_err("corrupt statfs reply, no tid\n");
421 ceph_msg_dump(msg);
422}
423
424/*
425 * (re)send a statfs request
426 */
427static int send_statfs(struct ceph_mon_client *monc,
428 struct ceph_mon_statfs_request *req)
429{
430 struct ceph_msg *msg;
431 struct ceph_mon_statfs *h;
432
433 dout("send_statfs tid %llu\n", req->tid);
434 msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL);
435 if (IS_ERR(msg))
436 return PTR_ERR(msg);
437 req->request = msg;
438 msg->hdr.tid = cpu_to_le64(req->tid);
439 h = msg->front.iov_base;
440 h->monhdr.have_version = 0;
441 h->monhdr.session_mon = cpu_to_le16(-1);
442 h->monhdr.session_mon_tid = 0;
443 h->fsid = monc->monmap->fsid;
444 ceph_con_send(monc->con, msg);
445 return 0;
446}
447
448/*
449 * Do a synchronous statfs().
450 */
451int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
452{
453 struct ceph_mon_statfs_request req;
454 int err;
455
456 req.buf = buf;
457 init_completion(&req.completion);
458
459 /* allocate memory for reply */
460 err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
461 if (err)
462 return err;
463
464 /* register request */
465 mutex_lock(&monc->mutex);
466 req.tid = ++monc->last_tid;
467 req.last_attempt = jiffies;
468 req.delay = BASE_DELAY_INTERVAL;
469 __insert_statfs(monc, &req);
470 monc->num_statfs_requests++;
471 mutex_unlock(&monc->mutex);
472
473 /* send request and wait */
474 err = send_statfs(monc, &req);
475 if (!err)
476 err = wait_for_completion_interruptible(&req.completion);
477
478 mutex_lock(&monc->mutex);
479 rb_erase(&req.node, &monc->statfs_request_tree);
480 monc->num_statfs_requests--;
481 ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
482 mutex_unlock(&monc->mutex);
483
484 if (!err)
485 err = req.result;
486 return err;
487}
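
/*
 * Sketch of a caller (the real hook is the filesystem ->statfs handler
 * in super.c): the monitor round trip above is fully synchronous, so a
 * caller may simply block on it.
 */
static int __maybe_unused example_statfs(struct ceph_mon_client *monc)
{
	struct ceph_statfs st;
	int err = ceph_monc_do_statfs(monc, &st);

	if (err < 0)
		return err;	/* send failure, or interrupted wait */
	/* st now holds the cluster-wide usage totals from the reply */
	return 0;
}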
488
489/*
490 * Resend pending statfs requests.
491 */
492static void __resend_statfs(struct ceph_mon_client *monc)
493{
494 struct ceph_mon_statfs_request *req;
495 struct rb_node *p;
496
497 for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) {
498 req = rb_entry(p, struct ceph_mon_statfs_request, node);
499 send_statfs(monc, req);
500 }
501}
502
503/*
504 * Delayed work. If we haven't mounted yet, retry. Otherwise,
505 * renew/retry subscription as needed (in case it is timing out, or we
506 * got an ENOMEM). And keep the monitor connection alive.
507 */
508static void delayed_work(struct work_struct *work)
509{
510 struct ceph_mon_client *monc =
511 container_of(work, struct ceph_mon_client, delayed_work.work);
512
513 dout("monc delayed_work\n");
514 mutex_lock(&monc->mutex);
515 if (monc->hunting) {
516 __close_session(monc);
517 __open_session(monc); /* continue hunting */
518 } else {
519 ceph_con_keepalive(monc->con);
520
521 __validate_auth(monc);
522
523 if (monc->auth->ops->is_authenticated(monc->auth))
524 __send_subscribe(monc);
525 }
526 __schedule_delayed(monc);
527 mutex_unlock(&monc->mutex);
528}
529
530/*
531 * On startup, we build a temporary monmap populated with the IPs
532 * provided by mount(2).
533 */
534static int build_initial_monmap(struct ceph_mon_client *monc)
535{
536 struct ceph_mount_args *args = monc->client->mount_args;
537 struct ceph_entity_addr *mon_addr = args->mon_addr;
538 int num_mon = args->num_mon;
539 int i;
540
541 /* build initial monmap */
542 monc->monmap = kzalloc(sizeof(*monc->monmap) +
543 num_mon*sizeof(monc->monmap->mon_inst[0]),
544 GFP_KERNEL);
545 if (!monc->monmap)
546 return -ENOMEM;
547 for (i = 0; i < num_mon; i++) {
548 monc->monmap->mon_inst[i].addr = mon_addr[i];
549 monc->monmap->mon_inst[i].addr.nonce = 0;
550 monc->monmap->mon_inst[i].name.type =
551 CEPH_ENTITY_TYPE_MON;
552 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
553 }
554 monc->monmap->num_mon = num_mon;
555 monc->have_fsid = false;
556
557 /* release addr memory */
558 kfree(args->mon_addr);
559 args->mon_addr = NULL;
560 args->num_mon = 0;
561 return 0;
562}
563
564int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
565{
566 int err = 0;
567
568 dout("init\n");
569 memset(monc, 0, sizeof(*monc));
570 monc->client = cl;
571 monc->monmap = NULL;
572 mutex_init(&monc->mutex);
573
574 err = build_initial_monmap(monc);
575 if (err)
576 goto out;
577
578 monc->con = NULL;
579
580 /* authentication */
581 monc->auth = ceph_auth_init(cl->mount_args->name,
582 cl->mount_args->secret);
583	if (IS_ERR(monc->auth))
584		{ err = PTR_ERR(monc->auth); goto out_monmap; }
585 monc->auth->want_keys =
586 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
587 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
588
589 /* msg pools */
590 err = ceph_msgpool_init(&monc->msgpool_subscribe_ack,
591 sizeof(struct ceph_mon_subscribe_ack), 1, false);
592 if (err < 0)
593 goto out_monmap;
594 err = ceph_msgpool_init(&monc->msgpool_statfs_reply,
595 sizeof(struct ceph_mon_statfs_reply), 0, false);
596 if (err < 0)
597 goto out_pool1;
598 err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false);
599 if (err < 0)
600 goto out_pool2;
601
602 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL);
603 monc->pending_auth = 0;
604 if (IS_ERR(monc->m_auth)) {
605 err = PTR_ERR(monc->m_auth);
606 monc->m_auth = NULL;
607 goto out_pool3;
608 }
609
610 monc->cur_mon = -1;
611 monc->hunting = true;
612 monc->sub_renew_after = jiffies;
613 monc->sub_sent = 0;
614
615 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
616 monc->statfs_request_tree = RB_ROOT;
617 monc->num_statfs_requests = 0;
618 monc->last_tid = 0;
619
620 monc->have_mdsmap = 0;
621 monc->have_osdmap = 0;
622 monc->want_next_osdmap = 1;
623 return 0;
624
625out_pool3:
626 ceph_msgpool_destroy(&monc->msgpool_auth_reply);
627out_pool2:
628	ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
629out_pool1:
630	ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
631out_monmap:
632 kfree(monc->monmap);
633out:
634 return err;
635}
636
637void ceph_monc_stop(struct ceph_mon_client *monc)
638{
639 dout("stop\n");
640 cancel_delayed_work_sync(&monc->delayed_work);
641
642 mutex_lock(&monc->mutex);
643 __close_session(monc);
644 if (monc->con) {
645 monc->con->private = NULL;
646 monc->con->ops->put(monc->con);
647 monc->con = NULL;
648 }
649 mutex_unlock(&monc->mutex);
650
651 ceph_auth_destroy(monc->auth);
652
653 ceph_msg_put(monc->m_auth);
654 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
655 ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
656 ceph_msgpool_destroy(&monc->msgpool_auth_reply);
657
658 kfree(monc->monmap);
659}
660
661static void handle_auth_reply(struct ceph_mon_client *monc,
662 struct ceph_msg *msg)
663{
664 int ret;
665
666 mutex_lock(&monc->mutex);
667 monc->pending_auth = 0;
668 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
669 msg->front.iov_len,
670 monc->m_auth->front.iov_base,
671 monc->m_auth->front_max);
672 if (ret < 0) {
673 monc->client->auth_err = ret;
674 wake_up(&monc->client->auth_wq);
675 } else if (ret > 0) {
676 __send_prepared_auth_request(monc, ret);
677 } else if (monc->auth->ops->is_authenticated(monc->auth)) {
678 dout("authenticated, starting session\n");
679
680 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
681 monc->client->msgr->inst.name.num = monc->auth->global_id;
682
683 __send_subscribe(monc);
684 __resend_statfs(monc);
685 }
686 mutex_unlock(&monc->mutex);
687}
688
689static int __validate_auth(struct ceph_mon_client *monc)
690{
691 int ret;
692
693 if (monc->pending_auth)
694 return 0;
695
696 ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
697 monc->m_auth->front_max);
698 if (ret <= 0)
699 return ret; /* either an error, or no need to authenticate */
700 __send_prepared_auth_request(monc, ret);
701 return 0;
702}
703
704int ceph_monc_validate_auth(struct ceph_mon_client *monc)
705{
706 int ret;
707
708 mutex_lock(&monc->mutex);
709 ret = __validate_auth(monc);
710 mutex_unlock(&monc->mutex);
711 return ret;
712}
713
714/*
715 * handle incoming message
716 */
717static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
718{
719 struct ceph_mon_client *monc = con->private;
720 int type = le16_to_cpu(msg->hdr.type);
721
722 if (!monc)
723 return;
724
725 switch (type) {
726 case CEPH_MSG_AUTH_REPLY:
727 handle_auth_reply(monc, msg);
728 break;
729
730 case CEPH_MSG_MON_SUBSCRIBE_ACK:
731 handle_subscribe_ack(monc, msg);
732 break;
733
734 case CEPH_MSG_STATFS_REPLY:
735 handle_statfs_reply(monc, msg);
736 break;
737
738 case CEPH_MSG_MON_MAP:
739 ceph_monc_handle_map(monc, msg);
740 break;
741
742 case CEPH_MSG_MDS_MAP:
743 ceph_mdsc_handle_map(&monc->client->mdsc, msg);
744 break;
745
746 case CEPH_MSG_OSD_MAP:
747 ceph_osdc_handle_map(&monc->client->osdc, msg);
748 break;
749
750 default:
751 pr_err("received unknown message type %d %s\n", type,
752 ceph_msg_type_name(type));
753 }
754 ceph_msg_put(msg);
755}
756
757/*
758 * Allocate memory for incoming message
759 */
760static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
761 struct ceph_msg_header *hdr,
762 int *skip)
763{
764 struct ceph_mon_client *monc = con->private;
765 int type = le16_to_cpu(hdr->type);
766 int front_len = le32_to_cpu(hdr->front_len);
767 struct ceph_msg *m = NULL;
768
769 *skip = 0;
770
771 switch (type) {
772 case CEPH_MSG_MON_SUBSCRIBE_ACK:
773 m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len);
774 break;
775 case CEPH_MSG_STATFS_REPLY:
776 m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len);
777 break;
778 case CEPH_MSG_AUTH_REPLY:
779 m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len);
780 break;
781 case CEPH_MSG_MON_MAP:
782 case CEPH_MSG_MDS_MAP:
783 case CEPH_MSG_OSD_MAP:
784 m = ceph_msg_new(type, front_len, 0, 0, NULL);
785 break;
786 }
787
788 if (!m) {
789 pr_info("alloc_msg unknown type %d\n", type);
790 *skip = 1;
791 }
792 return m;
793}
794
795/*
796 * If the monitor connection resets, pick a new monitor and resubmit
797 * any pending requests.
798 */
799static void mon_fault(struct ceph_connection *con)
800{
801 struct ceph_mon_client *monc = con->private;
802
803 if (!monc)
804 return;
805
806 dout("mon_fault\n");
807 mutex_lock(&monc->mutex);
808 if (!con->private)
809 goto out;
810
811 if (monc->con && !monc->hunting)
812 pr_info("mon%d %s session lost, "
813 "hunting for new mon\n", monc->cur_mon,
814 pr_addr(&monc->con->peer_addr.in_addr));
815
816 __close_session(monc);
817 if (!monc->hunting) {
818 /* start hunting */
819 monc->hunting = true;
820 __open_session(monc);
821 } else {
822 /* already hunting, let's wait a bit */
823 __schedule_delayed(monc);
824 }
825out:
826 mutex_unlock(&monc->mutex);
827}
828
829static const struct ceph_connection_operations mon_con_ops = {
830 .get = ceph_con_get,
831 .put = ceph_con_put,
832 .dispatch = dispatch,
833 .fault = mon_fault,
834 .alloc_msg = mon_alloc_msg,
835};
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
new file mode 100644
index 000000000000..b958ad5afa06
--- /dev/null
+++ b/fs/ceph/mon_client.h
@@ -0,0 +1,119 @@
1#ifndef _FS_CEPH_MON_CLIENT_H
2#define _FS_CEPH_MON_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/rbtree.h>
6
7#include "messenger.h"
8#include "msgpool.h"
9
10struct ceph_client;
11struct ceph_mount_args;
12struct ceph_auth_client;
13
14/*
15 * The monitor map enumerates the set of all monitors.
16 */
17struct ceph_monmap {
18 struct ceph_fsid fsid;
19 u32 epoch;
20 u32 num_mon;
21 struct ceph_entity_inst mon_inst[0];
22};
23
24struct ceph_mon_client;
25struct ceph_mon_statfs_request;
26
27
28/*
29 * Generic mechanism for resending monitor requests.
30 */
31typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
32 int newmon);
33
34/* a pending monitor request */
35struct ceph_mon_request {
36 struct ceph_mon_client *monc;
37 struct delayed_work delayed_work;
38 unsigned long delay;
39 ceph_monc_request_func_t do_request;
40};
41
42/*
43 * statfs() is done a bit differently because we need to get data back
44 * to the caller
45 */
46struct ceph_mon_statfs_request {
47 u64 tid;
48 struct rb_node node;
49 int result;
50 struct ceph_statfs *buf;
51 struct completion completion;
52 unsigned long last_attempt, delay; /* jiffies */
53 struct ceph_msg *request; /* original request */
54};
55
56struct ceph_mon_client {
57 struct ceph_client *client;
58 struct ceph_monmap *monmap;
59
60 struct mutex mutex;
61 struct delayed_work delayed_work;
62
63 struct ceph_auth_client *auth;
64 struct ceph_msg *m_auth;
65 int pending_auth;
66
67 bool hunting;
 68	int cur_mon;                    /* last monitor we contacted */
69 unsigned long sub_sent, sub_renew_after;
70 struct ceph_connection *con;
71 bool have_fsid;
72
73 /* msg pools */
74 struct ceph_msgpool msgpool_subscribe_ack;
75 struct ceph_msgpool msgpool_statfs_reply;
76 struct ceph_msgpool msgpool_auth_reply;
77
78 /* pending statfs requests */
79 struct rb_root statfs_request_tree;
80 int num_statfs_requests;
81 u64 last_tid;
82
83 /* mds/osd map */
84 int want_next_osdmap; /* 1 = want, 2 = want+asked */
85 u32 have_osdmap, have_mdsmap;
86
87#ifdef CONFIG_DEBUG_FS
88 struct dentry *debugfs_file;
89#endif
90};
91
92extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
93extern int ceph_monmap_contains(struct ceph_monmap *m,
94 struct ceph_entity_addr *addr);
95
96extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
97extern void ceph_monc_stop(struct ceph_mon_client *monc);
98
99/*
100 * The model here is to indicate that we need a new map of at least
101 * epoch @want, and also call in when we receive a map. We will
102 * periodically rerequest the map from the monitor cluster until we
103 * get what we want.
104 */
105extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
106extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
107
108extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
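
/*
 * Illustrative cycle for the model described above:
 *
 *	ceph_monc_request_next_osdmap(monc);     subscribe to a newer map
 *	... a CEPH_MSG_OSD_MAP message arrives; epoch E is decoded ...
 *	ceph_monc_got_osdmap(monc, E);           record what we now have
 */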
109
110extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
111 struct ceph_statfs *buf);
112
113extern int ceph_monc_open_session(struct ceph_mon_client *monc);
114
115extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
116
117
118
119#endif
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
new file mode 100644
index 000000000000..ca3b44a89f2d
--- /dev/null
+++ b/fs/ceph/msgpool.c
@@ -0,0 +1,186 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/sched.h>
5#include <linux/types.h>
6#include <linux/vmalloc.h>
7
8#include "msgpool.h"
9
10/*
11 * We use msg pools to preallocate memory for messages we expect to
12 * receive over the wire, to avoid getting ourselves into OOM
 13 * conditions at unexpected times.  We use a few different
14 * strategies:
15 *
16 * - for request/response type interactions, we preallocate the
17 * memory needed for the response when we generate the request.
18 *
19 * - for messages we can receive at any time from the MDS, we preallocate
20 * a pool of messages we can re-use.
21 *
22 * - for writeback, we preallocate some number of messages to use for
23 * requests and their replies, so that we always make forward
24 * progress.
25 *
26 * The msgpool behaves like a mempool_t, but keeps preallocated
27 * ceph_msgs strung together on a list_head instead of using a pointer
28 * vector. This avoids vector reallocation when we adjust the number
29 * of preallocated items (which happens frequently).
30 */
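
/*
 * Typical request/response reservation (a sketch; compare
 * ceph_monc_do_statfs() in mon_client.c):
 *
 *	ceph_msgpool_resv(&pool, 1);     reserve the reply buffer up front
 *	... send the request; the reply is allocated via ceph_msgpool_get() ...
 *	ceph_msgpool_resv(&pool, -1);    drop the reservation when done
 */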
31
32
33/*
34 * Allocate or release as necessary to meet our target pool size.
35 */
36static int __fill_msgpool(struct ceph_msgpool *pool)
37{
38 struct ceph_msg *msg;
39
40 while (pool->num < pool->min) {
41 dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
42 pool->min);
43 spin_unlock(&pool->lock);
44 msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
45 spin_lock(&pool->lock);
46 if (IS_ERR(msg))
47 return PTR_ERR(msg);
48 msg->pool = pool;
49 list_add(&msg->list_head, &pool->msgs);
50 pool->num++;
51 }
52 while (pool->num > pool->min) {
53 msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
54 dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
55 pool->min, msg);
56 list_del_init(&msg->list_head);
57 pool->num--;
58 ceph_msg_kfree(msg);
59 }
60 return 0;
61}
62
63int ceph_msgpool_init(struct ceph_msgpool *pool,
64 int front_len, int min, bool blocking)
65{
66 int ret;
67
68 dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
69 spin_lock_init(&pool->lock);
70 pool->front_len = front_len;
71 INIT_LIST_HEAD(&pool->msgs);
72 pool->num = 0;
73 pool->min = min;
74 pool->blocking = blocking;
75 init_waitqueue_head(&pool->wait);
76
77 spin_lock(&pool->lock);
78 ret = __fill_msgpool(pool);
79 spin_unlock(&pool->lock);
80 return ret;
81}
82
83void ceph_msgpool_destroy(struct ceph_msgpool *pool)
84{
85 dout("msgpool_destroy %p\n", pool);
86 spin_lock(&pool->lock);
87 pool->min = 0;
88 __fill_msgpool(pool);
89 spin_unlock(&pool->lock);
90}
91
92int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta)
93{
94 int ret;
95
96 spin_lock(&pool->lock);
97 dout("msgpool_resv %p delta %d\n", pool, delta);
98 pool->min += delta;
99 ret = __fill_msgpool(pool);
100 spin_unlock(&pool->lock);
101 return ret;
102}
103
104struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
105{
106 wait_queue_t wait;
107 struct ceph_msg *msg;
108
109 if (front_len && front_len > pool->front_len) {
110 pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
111 pool, front_len, pool->front_len);
112 WARN_ON(1);
113
114 /* try to alloc a fresh message */
115 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
116 if (!IS_ERR(msg))
117 return msg;
118 }
119
120 if (!front_len)
121 front_len = pool->front_len;
122
123 if (pool->blocking) {
124 /* mempool_t behavior; first try to alloc */
125 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
126 if (!IS_ERR(msg))
127 return msg;
128 }
129
130 while (1) {
131 spin_lock(&pool->lock);
132 if (likely(pool->num)) {
133 msg = list_entry(pool->msgs.next, struct ceph_msg,
134 list_head);
135 list_del_init(&msg->list_head);
136 pool->num--;
137 dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
138 pool->num, pool->min);
139 spin_unlock(&pool->lock);
140 return msg;
141 }
142 pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
143 pool->min, pool->blocking ? "waiting" : "may fail");
144 spin_unlock(&pool->lock);
145
146 if (!pool->blocking) {
147 WARN_ON(1);
148
149 /* maybe we can allocate it now? */
150 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
151 if (!IS_ERR(msg))
152 return msg;
153
154 pr_err("msgpool_get %p empty + alloc failed\n", pool);
155 return ERR_PTR(-ENOMEM);
156 }
157
158 init_wait(&wait);
159 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
160 schedule();
161 finish_wait(&pool->wait, &wait);
162 }
163}
164
165void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
166{
167 spin_lock(&pool->lock);
168 if (pool->num < pool->min) {
169 /* reset msg front_len; user may have changed it */
170 msg->front.iov_len = pool->front_len;
171 msg->hdr.front_len = cpu_to_le32(pool->front_len);
172
173 kref_set(&msg->kref, 1); /* retake a single ref */
174 list_add(&msg->list_head, &pool->msgs);
175 pool->num++;
176 dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
177 pool->num, pool->min);
178 spin_unlock(&pool->lock);
179 wake_up(&pool->wait);
180 } else {
181 dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
182 pool->num, pool->min);
183 spin_unlock(&pool->lock);
184 ceph_msg_kfree(msg);
185 }
186}
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
new file mode 100644
index 000000000000..bc834bfcd720
--- /dev/null
+++ b/fs/ceph/msgpool.h
@@ -0,0 +1,27 @@
1#ifndef _FS_CEPH_MSGPOOL
2#define _FS_CEPH_MSGPOOL
3
4#include "messenger.h"
5
6/*
7 * we use memory pools for preallocating messages we may receive, to
8 * avoid unexpected OOM conditions.
9 */
10struct ceph_msgpool {
11 spinlock_t lock;
12 int front_len; /* preallocated payload size */
13 struct list_head msgs; /* msgs in the pool; each has 1 ref */
14 int num, min; /* cur, min # msgs in the pool */
15 bool blocking;
16 wait_queue_head_t wait;
17};
18
19extern int ceph_msgpool_init(struct ceph_msgpool *pool,
20 int front_len, int size, bool blocking);
21extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
22extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
23extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
24 int front_len);
25extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
26
27#endif
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
new file mode 100644
index 000000000000..8aaab414f3f8
--- /dev/null
+++ b/fs/ceph/msgr.h
@@ -0,0 +1,158 @@
1#ifndef __MSGR_H
2#define __MSGR_H
3
4/*
5 * Data types for message passing layer used by Ceph.
6 */
7
8#define CEPH_MON_PORT 6789 /* default monitor port */
9
10/*
11 * client-side processes will try to bind to ports in this
12 * range, simply for the benefit of tools like nmap or wireshark
13 * that would like to identify the protocol.
14 */
15#define CEPH_PORT_FIRST 6789
16#define CEPH_PORT_START 6800 /* non-monitors start here */
17#define CEPH_PORT_LAST 6900
18
19/*
 20 * tcp connection banner.  include a protocol version, and adjust it
21 * whenever the wire protocol changes. try to keep this string length
22 * constant.
23 */
24#define CEPH_BANNER "ceph v027"
25#define CEPH_BANNER_MAX_LEN 30
26
27
28/*
29 * Rollover-safe type and comparator for 32-bit sequence numbers.
 30 * Comparator returns a negative, zero, or positive value (strcmp-style).
31 */
32typedef __u32 ceph_seq_t;
33
34static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
35{
36 return (__s32)a - (__s32)b;
37}
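
/*
 * Rollover example: with a = 1 and b = 0xffffffff the unsigned values
 * straddle the wrap point, but (__s32)a - (__s32)b = 1 - (-1) = 2 > 0,
 * so a is correctly ordered after b.
 */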
38
39
40/*
41 * entity_name -- logical name for a process participating in the
42 * network, e.g. 'mds0' or 'osd3'.
43 */
44struct ceph_entity_name {
45 __u8 type; /* CEPH_ENTITY_TYPE_* */
46 __le64 num;
47} __attribute__ ((packed));
48
49#define CEPH_ENTITY_TYPE_MON 0x01
50#define CEPH_ENTITY_TYPE_MDS 0x02
51#define CEPH_ENTITY_TYPE_OSD 0x04
52#define CEPH_ENTITY_TYPE_CLIENT 0x08
53#define CEPH_ENTITY_TYPE_ADMIN 0x10
54#define CEPH_ENTITY_TYPE_AUTH 0x20
55
56#define CEPH_ENTITY_TYPE_ANY 0xFF
57
58extern const char *ceph_entity_type_name(int type);
59
60/*
61 * entity_addr -- network address
62 */
63struct ceph_entity_addr {
64 __le32 type;
65 __le32 nonce; /* unique id for process (e.g. pid) */
66 struct sockaddr_storage in_addr;
67} __attribute__ ((packed));
68
69struct ceph_entity_inst {
70 struct ceph_entity_name name;
71 struct ceph_entity_addr addr;
72} __attribute__ ((packed));
73
74
75/* used by message exchange protocol */
76#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
77#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
78#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
79 incoming connection */
80#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
81 with higher cseq */
82#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
83 with higher gseq */
84#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
85#define CEPH_MSGR_TAG_MSG 7 /* message */
86#define CEPH_MSGR_TAG_ACK 8 /* message ack */
87#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
88#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
89#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
90#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */
91
92
93/*
94 * connection negotiation
95 */
96struct ceph_msg_connect {
97 __le64 features; /* supported feature bits */
98 __le32 host_type; /* CEPH_ENTITY_TYPE_* */
99 __le32 global_seq; /* count connections initiated by this host */
100 __le32 connect_seq; /* count connections initiated in this session */
101 __le32 protocol_version;
102 __le32 authorizer_protocol;
103 __le32 authorizer_len;
104 __u8 flags; /* CEPH_MSG_CONNECT_* */
105} __attribute__ ((packed));
106
107struct ceph_msg_connect_reply {
108 __u8 tag;
109 __le64 features; /* feature bits for this session */
110 __le32 global_seq;
111 __le32 connect_seq;
112 __le32 protocol_version;
113 __le32 authorizer_len;
114 __u8 flags;
115} __attribute__ ((packed));
116
117#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */
118
119
120/*
121 * message header
122 */
123struct ceph_msg_header {
124 __le64 seq; /* message seq# for this session */
125 __le64 tid; /* transaction id */
126 __le16 type; /* message type */
127 __le16 priority; /* priority. higher value == higher priority */
128 __le16 version; /* version of message encoding */
129
130 __le32 front_len; /* bytes in main payload */
131 __le32 middle_len;/* bytes in middle payload */
132 __le32 data_len; /* bytes of data payload */
133 __le16 data_off; /* sender: include full offset;
134 receiver: mask against ~PAGE_MASK */
135
136 struct ceph_entity_inst src, orig_src;
137 __le32 reserved;
138 __le32 crc; /* header crc32c */
139} __attribute__ ((packed));
140
141#define CEPH_MSG_PRIO_LOW 64
142#define CEPH_MSG_PRIO_DEFAULT 127
143#define CEPH_MSG_PRIO_HIGH 196
144#define CEPH_MSG_PRIO_HIGHEST 255
145
146/*
147 * follows data payload
148 */
149struct ceph_msg_footer {
150 __le32 front_crc, middle_crc, data_crc;
151 __u8 flags;
152} __attribute__ ((packed));
153
154#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */
155#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
156
157
158#endif
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
new file mode 100644
index 000000000000..3514f71ff85f
--- /dev/null
+++ b/fs/ceph/osd_client.c
@@ -0,0 +1,1564 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/highmem.h>
5#include <linux/mm.h>
6#include <linux/pagemap.h>
7#include <linux/slab.h>
8#include <linux/uaccess.h>
9
10#include "super.h"
11#include "osd_client.h"
12#include "messenger.h"
13#include "decode.h"
14#include "auth.h"
15
16#define OSD_OP_FRONT_LEN 4096
17#define OSD_OPREPLY_FRONT_LEN 512
18
 19static const struct ceph_connection_operations osd_con_ops;
20static int __kick_requests(struct ceph_osd_client *osdc,
21 struct ceph_osd *kickosd);
22
23static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
24
25/*
26 * Implement client access to distributed object storage cluster.
27 *
28 * All data objects are stored within a cluster/cloud of OSDs, or
29 * "object storage devices." (Note that Ceph OSDs have _nothing_ to
30 * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
31 * remote daemons serving up and coordinating consistent and safe
32 * access to storage.
33 *
34 * Cluster membership and the mapping of data objects onto storage devices
35 * are described by the osd map.
36 *
37 * We keep track of pending OSD requests (read, write), resubmit
38 * requests to different OSDs when the cluster topology/data layout
39 * change, or retry the affected requests when the communications
40 * channel with an OSD is reset.
41 */
42
43/*
44 * calculate the mapping of a file extent onto an object, and fill out the
45 * request accordingly. shorten extent as necessary if it crosses an
46 * object boundary.
47 *
48 * fill osd op in request message.
49 */
50static void calc_layout(struct ceph_osd_client *osdc,
51 struct ceph_vino vino, struct ceph_file_layout *layout,
52 u64 off, u64 *plen,
53 struct ceph_osd_request *req)
54{
55 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
56 struct ceph_osd_op *op = (void *)(reqhead + 1);
57 u64 orig_len = *plen;
58 u64 objoff, objlen; /* extent in object */
59 u64 bno;
60
61 reqhead->snapid = cpu_to_le64(vino.snap);
62
63 /* object extent? */
64 ceph_calc_file_object_mapping(layout, off, plen, &bno,
65 &objoff, &objlen);
66 if (*plen < orig_len)
67 dout(" skipping last %llu, final file extent %llu~%llu\n",
68 orig_len - *plen, off, *plen);
69
70 sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
71 req->r_oid_len = strlen(req->r_oid);
72
73 op->extent.offset = cpu_to_le64(objoff);
74 op->extent.length = cpu_to_le64(objlen);
75 req->r_num_pages = calc_pages_for(off, *plen);
76
77 dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
78 req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
79}
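
/*
 * Worked example (assuming the default 4 MB object size): ino 0x1000
 * with off = 5 MB lands in block 1 at in-object offset 1 MB, so the
 * oid becomes "1000.00000001" per the "%llx.%08llx" format above.
 */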
80
81/*
82 * requests
83 */
84void ceph_osdc_release_request(struct kref *kref)
85{
86 struct ceph_osd_request *req = container_of(kref,
87 struct ceph_osd_request,
88 r_kref);
89
90 if (req->r_request)
91 ceph_msg_put(req->r_request);
92 if (req->r_reply)
93 ceph_msg_put(req->r_reply);
94 if (req->r_con_filling_msg) {
95 dout("release_request revoking pages %p from con %p\n",
96 req->r_pages, req->r_con_filling_msg);
97 ceph_con_revoke_message(req->r_con_filling_msg,
98 req->r_reply);
99 ceph_con_put(req->r_con_filling_msg);
100 }
101 if (req->r_own_pages)
102 ceph_release_page_vector(req->r_pages,
103 req->r_num_pages);
104 ceph_put_snap_context(req->r_snapc);
105 if (req->r_mempool)
106 mempool_free(req, req->r_osdc->req_mempool);
107 else
108 kfree(req);
109}
110
111/*
112 * build new request AND message, calculate layout, and adjust file
113 * extent as needed.
114 *
115 * if the file was recently truncated, we include information about its
116 * old and new size so that the object can be updated appropriately. (we
117 * avoid synchronously deleting truncated objects because it's slow.)
118 *
119 * if @do_sync, include a 'startsync' command so that the osd will flush
120 * data quickly.
121 */
122struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
123 struct ceph_file_layout *layout,
124 struct ceph_vino vino,
125 u64 off, u64 *plen,
126 int opcode, int flags,
127 struct ceph_snap_context *snapc,
128 int do_sync,
129 u32 truncate_seq,
130 u64 truncate_size,
131 struct timespec *mtime,
132 bool use_mempool, int num_reply)
133{
134 struct ceph_osd_request *req;
135 struct ceph_msg *msg;
136 struct ceph_osd_request_head *head;
137 struct ceph_osd_op *op;
138 void *p;
139 int num_op = 1 + do_sync;
140 size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
141 int i;
142
143 if (use_mempool) {
144 req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
145 memset(req, 0, sizeof(*req));
146 } else {
147 req = kzalloc(sizeof(*req), GFP_NOFS);
148 }
149 if (req == NULL)
150 return ERR_PTR(-ENOMEM);
151
152 req->r_osdc = osdc;
153 req->r_mempool = use_mempool;
154 kref_init(&req->r_kref);
155 init_completion(&req->r_completion);
156 init_completion(&req->r_safe_completion);
157 INIT_LIST_HEAD(&req->r_unsafe_item);
158 req->r_flags = flags;
159
160 WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
161
162 /* create reply message */
163 if (use_mempool)
164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
165 else
166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
167 OSD_OPREPLY_FRONT_LEN, 0, 0, NULL);
168 if (IS_ERR(msg)) {
169 ceph_osdc_put_request(req);
170 return ERR_PTR(PTR_ERR(msg));
171 }
172 req->r_reply = msg;
173
174 /* create request message; allow space for oid */
175 msg_size += 40;
176 if (snapc)
177 msg_size += sizeof(u64) * snapc->num_snaps;
178 if (use_mempool)
179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
180 else
181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL);
182 if (IS_ERR(msg)) {
183 ceph_osdc_put_request(req);
184 return ERR_PTR(PTR_ERR(msg));
185 }
186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
187 memset(msg->front.iov_base, 0, msg->front.iov_len);
188 head = msg->front.iov_base;
189 op = (void *)(head + 1);
190 p = (void *)(op + num_op);
191
192 req->r_request = msg;
193 req->r_snapc = ceph_get_snap_context(snapc);
194
195 head->client_inc = cpu_to_le32(1); /* always, for now. */
196 head->flags = cpu_to_le32(flags);
197 if (flags & CEPH_OSD_FLAG_WRITE)
198 ceph_encode_timespec(&head->mtime, mtime);
199 head->num_ops = cpu_to_le16(num_op);
200 op->op = cpu_to_le16(opcode);
201
202 /* calculate max write size */
203 calc_layout(osdc, vino, layout, off, plen, req);
204 req->r_file_layout = *layout; /* keep a copy */
205
206 if (flags & CEPH_OSD_FLAG_WRITE) {
207 req->r_request->hdr.data_off = cpu_to_le16(off);
208 req->r_request->hdr.data_len = cpu_to_le32(*plen);
209 op->payload_len = cpu_to_le32(*plen);
210 }
211 op->extent.truncate_size = cpu_to_le64(truncate_size);
212 op->extent.truncate_seq = cpu_to_le32(truncate_seq);
213
214 /* fill in oid */
215 head->object_len = cpu_to_le32(req->r_oid_len);
216 memcpy(p, req->r_oid, req->r_oid_len);
217 p += req->r_oid_len;
218
219 if (do_sync) {
220 op++;
221 op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
222 }
223 if (snapc) {
224 head->snap_seq = cpu_to_le64(snapc->seq);
225 head->num_snaps = cpu_to_le32(snapc->num_snaps);
226 for (i = 0; i < snapc->num_snaps; i++) {
227 put_unaligned_le64(snapc->snaps[i], p);
228 p += sizeof(u64);
229 }
230 }
231
232 BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
233 msg_size = p - msg->front.iov_base;
234 msg->front.iov_len = msg_size;
235 msg->hdr.front_len = cpu_to_le32(msg_size);
236 return req;
237}
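
/*
 * Illustrative usage sketch (an annotation, not part of the original
 * file; ceph_osdc_readpages() later in this file is the in-tree example
 * this is modeled on, and osdc/layout/vino/off/len/pages are assumed to
 * be in scope):
 *
 *	struct ceph_osd_request *req;
 *	int rc;
 *
 *	req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
 *				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
 *				    NULL, 0, truncate_seq, truncate_size,
 *				    NULL, false, 1);
 *	if (IS_ERR(req))
 *		return PTR_ERR(req);
 *	req->r_pages = pages;
 *	req->r_num_pages = calc_pages_for(off, len);
 *	rc = ceph_osdc_start_request(osdc, req, false);
 *	if (!rc)
 *		rc = ceph_osdc_wait_request(osdc, req);
 *	ceph_osdc_put_request(req);
 */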
238
239/*
240 * We keep osd requests in an rbtree, sorted by ->r_tid.
241 */
242static void __insert_request(struct ceph_osd_client *osdc,
243 struct ceph_osd_request *new)
244{
245 struct rb_node **p = &osdc->requests.rb_node;
246 struct rb_node *parent = NULL;
247 struct ceph_osd_request *req = NULL;
248
249 while (*p) {
250 parent = *p;
251 req = rb_entry(parent, struct ceph_osd_request, r_node);
252 if (new->r_tid < req->r_tid)
253 p = &(*p)->rb_left;
254 else if (new->r_tid > req->r_tid)
255 p = &(*p)->rb_right;
256 else
257 BUG();
258 }
259
260 rb_link_node(&new->r_node, parent, p);
261 rb_insert_color(&new->r_node, &osdc->requests);
262}
263
264static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
265 u64 tid)
266{
267 struct ceph_osd_request *req;
268 struct rb_node *n = osdc->requests.rb_node;
269
270 while (n) {
271 req = rb_entry(n, struct ceph_osd_request, r_node);
272 if (tid < req->r_tid)
273 n = n->rb_left;
274 else if (tid > req->r_tid)
275 n = n->rb_right;
276 else
277 return req;
278 }
279 return NULL;
280}
281
282static struct ceph_osd_request *
283__lookup_request_ge(struct ceph_osd_client *osdc,
284 u64 tid)
285{
286 struct ceph_osd_request *req, *best = NULL;
287 struct rb_node *n = osdc->requests.rb_node;
288
289 while (n) {
290 req = rb_entry(n, struct ceph_osd_request, r_node);
291 if (tid < req->r_tid) {
292 /* remember candidate; a smaller fit may sit in the left subtree */
293 best = req;
294 n = n->rb_left;
295 } else if (tid > req->r_tid) {
296 n = n->rb_right;
297 } else {
298 return req;
299 }
300 }
301 return best;
302}
303
304
305/*
306 * If the osd connection drops, we need to resubmit all requests.
307 */
308static void osd_reset(struct ceph_connection *con)
309{
310 struct ceph_osd *osd = con->private;
311 struct ceph_osd_client *osdc;
312
313 if (!osd)
314 return;
315 dout("osd_reset osd%d\n", osd->o_osd);
316 osdc = osd->o_osdc;
317 down_read(&osdc->map_sem);
318 kick_requests(osdc, osd);
319 up_read(&osdc->map_sem);
320}
321
322/*
323 * Track open sessions with osds.
324 */
325static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
326{
327 struct ceph_osd *osd;
328
329 osd = kzalloc(sizeof(*osd), GFP_NOFS);
330 if (!osd)
331 return NULL;
332
333 atomic_set(&osd->o_ref, 1);
334 osd->o_osdc = osdc;
335 INIT_LIST_HEAD(&osd->o_requests);
336 INIT_LIST_HEAD(&osd->o_osd_lru);
337 osd->o_incarnation = 1;
338
339 ceph_con_init(osdc->client->msgr, &osd->o_con);
340 osd->o_con.private = osd;
341 osd->o_con.ops = &osd_con_ops;
342 osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
343
344 INIT_LIST_HEAD(&osd->o_keepalive_item);
345 return osd;
346}
347
348static struct ceph_osd *get_osd(struct ceph_osd *osd)
349{
350 if (atomic_inc_not_zero(&osd->o_ref)) {
351 dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
352 atomic_read(&osd->o_ref));
353 return osd;
354 } else {
355 dout("get_osd %p FAIL\n", osd);
356 return NULL;
357 }
358}
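
/*
 * Annotation (not in the original file): atomic_inc_not_zero() makes a
 * lookup racing with the final put_osd() fail cleanly instead of
 * resurrecting an osd whose refcount already reached zero; callers must
 * handle the NULL return, as get_osd_con() at the bottom of this file
 * does.
 */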
359
360static void put_osd(struct ceph_osd *osd)
361{
362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
363 atomic_read(&osd->o_ref) - 1);
364 if (atomic_dec_and_test(&osd->o_ref))
365 kfree(osd);
366}
367
368/*
369 * remove an osd from our map
370 */
371static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
372{
373 dout("__remove_osd %p\n", osd);
374 BUG_ON(!list_empty(&osd->o_requests));
375 rb_erase(&osd->o_node, &osdc->osds);
376 list_del_init(&osd->o_osd_lru);
377 ceph_con_close(&osd->o_con);
378 put_osd(osd);
379}
380
381static void __move_osd_to_lru(struct ceph_osd_client *osdc,
382 struct ceph_osd *osd)
383{
384 dout("__move_osd_to_lru %p\n", osd);
385 BUG_ON(!list_empty(&osd->o_osd_lru));
386 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
387 osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ;
388}
389
390static void __remove_osd_from_lru(struct ceph_osd *osd)
391{
392 dout("__remove_osd_from_lru %p\n", osd);
393 if (!list_empty(&osd->o_osd_lru))
394 list_del_init(&osd->o_osd_lru);
395}
396
397static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
398{
399 struct ceph_osd *osd, *nosd;
400
401 dout("remove_old_osds %p\n", osdc);
402 mutex_lock(&osdc->request_mutex);
403 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
404 if (!remove_all && time_before(jiffies, osd->lru_ttl))
405 break;
406 __remove_osd(osdc, osd);
407 }
408 mutex_unlock(&osdc->request_mutex);
409}
410
411/*
412 * reset osd connect
413 */
414static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
415{
416 struct ceph_osd_request *req;
417 int ret = 0;
418
419 dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
420 if (list_empty(&osd->o_requests)) {
421 __remove_osd(osdc, osd);
422 } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
423 &osd->o_con.peer_addr,
424 sizeof(osd->o_con.peer_addr)) == 0 &&
425 !ceph_con_opened(&osd->o_con)) {
426 dout(" osd addr hasn't changed and connection never opened,"
427 " letting msgr retry\n");
428 /* touch each r_stamp for handle_timeout()'s benefit */
429 list_for_each_entry(req, &osd->o_requests, r_osd_item)
430 req->r_stamp = jiffies;
431 ret = -EAGAIN;
432 } else {
433 ceph_con_close(&osd->o_con);
434 ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
435 osd->o_incarnation++;
436 }
437 return ret;
438}
439
440static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
441{
442 struct rb_node **p = &osdc->osds.rb_node;
443 struct rb_node *parent = NULL;
444 struct ceph_osd *osd = NULL;
445
446 while (*p) {
447 parent = *p;
448 osd = rb_entry(parent, struct ceph_osd, o_node);
449 if (new->o_osd < osd->o_osd)
450 p = &(*p)->rb_left;
451 else if (new->o_osd > osd->o_osd)
452 p = &(*p)->rb_right;
453 else
454 BUG();
455 }
456
457 rb_link_node(&new->o_node, parent, p);
458 rb_insert_color(&new->o_node, &osdc->osds);
459}
460
461static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
462{
463 struct ceph_osd *osd;
464 struct rb_node *n = osdc->osds.rb_node;
465
466 while (n) {
467 osd = rb_entry(n, struct ceph_osd, o_node);
468 if (o < osd->o_osd)
469 n = n->rb_left;
470 else if (o > osd->o_osd)
471 n = n->rb_right;
472 else
473 return osd;
474 }
475 return NULL;
476}
477
478static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
479{
480 schedule_delayed_work(&osdc->timeout_work,
481 osdc->client->mount_args->osd_keepalive_timeout * HZ);
482}
483
484static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
485{
486 cancel_delayed_work(&osdc->timeout_work);
487}
488
489/*
490 * Register request, assign tid. If this is the first request, set up
491 * the timeout event.
492 */
493static void register_request(struct ceph_osd_client *osdc,
494 struct ceph_osd_request *req)
495{
496 mutex_lock(&osdc->request_mutex);
497 req->r_tid = ++osdc->last_tid;
498 req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
499 INIT_LIST_HEAD(&req->r_req_lru_item);
500
501 dout("register_request %p tid %lld\n", req, req->r_tid);
502 __insert_request(osdc, req);
503 ceph_osdc_get_request(req);
504 osdc->num_requests++;
505
506 if (osdc->num_requests == 1) {
507 dout(" first request, scheduling timeout\n");
508 __schedule_osd_timeout(osdc);
509 }
510 mutex_unlock(&osdc->request_mutex);
511}
512
513/*
514 * called under osdc->request_mutex
515 */
516static void __unregister_request(struct ceph_osd_client *osdc,
517 struct ceph_osd_request *req)
518{
519 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
520 rb_erase(&req->r_node, &osdc->requests);
521 osdc->num_requests--;
522
523 if (req->r_osd) {
524 /* make sure the original request isn't in flight. */
525 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
526
527 list_del_init(&req->r_osd_item);
528 if (list_empty(&req->r_osd->o_requests))
529 __move_osd_to_lru(osdc, req->r_osd);
530 req->r_osd = NULL;
531 }
532
533 ceph_osdc_put_request(req);
534
535 list_del_init(&req->r_req_lru_item);
536 if (osdc->num_requests == 0) {
537 dout(" no requests, canceling timeout\n");
538 __cancel_osd_timeout(osdc);
539 }
540}
541
542/*
543 * Cancel a previously queued request message
544 */
545static void __cancel_request(struct ceph_osd_request *req)
546{
547 if (req->r_sent) {
548 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
549 req->r_sent = 0;
550 }
551 list_del_init(&req->r_req_lru_item);
552}
553
554/*
555 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
556 * (as needed), and set the request r_osd appropriately. If there is
557 * no up osd, set r_osd to NULL.
558 *
559 * Return 0 if unchanged, 1 if changed, or negative on error.
560 *
561 * Caller should hold map_sem for read and request_mutex.
562 */
563static int __map_osds(struct ceph_osd_client *osdc,
564 struct ceph_osd_request *req)
565{
566 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
567 struct ceph_pg pgid;
568 int acting[CEPH_PG_MAX_SIZE];
569 int o = -1, num = 0;
570 int err;
571
572 dout("map_osds %p tid %lld\n", req, req->r_tid);
573 err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
574 &req->r_file_layout, osdc->osdmap);
575 if (err)
576 return err;
577 pgid = reqhead->layout.ol_pgid;
578 req->r_pgid = pgid;
579
580 err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
581 if (err > 0) {
582 o = acting[0];
583 num = err;
584 }
585
586 if ((req->r_osd && req->r_osd->o_osd == o &&
587 req->r_sent >= req->r_osd->o_incarnation &&
588 req->r_num_pg_osds == num &&
589 memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
590 (req->r_osd == NULL && o == -1))
591 return 0; /* no change */
592
593 dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
594 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
595 req->r_osd ? req->r_osd->o_osd : -1);
596
597 /* record full pg acting set */
598 memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
599 req->r_num_pg_osds = num;
600
601 if (req->r_osd) {
602 __cancel_request(req);
603 list_del_init(&req->r_osd_item);
604 req->r_osd = NULL;
605 }
606
607 req->r_osd = __lookup_osd(osdc, o);
608 if (!req->r_osd && o >= 0) {
609 err = -ENOMEM;
610 req->r_osd = create_osd(osdc);
611 if (!req->r_osd)
612 goto out;
613
614 dout("map_osds osd %p is osd%d\n", req->r_osd, o);
615 req->r_osd->o_osd = o;
616 req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
617 __insert_osd(osdc, req->r_osd);
618
619 ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
620 }
621
622 if (req->r_osd) {
623 __remove_osd_from_lru(req->r_osd);
624 list_add(&req->r_osd_item, &req->r_osd->o_requests);
625 }
626 err = 1; /* osd or pg changed */
627
628out:
629 return err;
630}
631
632/*
633 * caller should hold map_sem (for read) and request_mutex
634 */
635static int __send_request(struct ceph_osd_client *osdc,
636 struct ceph_osd_request *req)
637{
638 struct ceph_osd_request_head *reqhead;
639 int err;
640
641 err = __map_osds(osdc, req);
642 if (err < 0)
643 return err;
644 if (req->r_osd == NULL) {
645 dout("send_request %p no up osds in pg\n", req);
646 ceph_monc_request_next_osdmap(&osdc->client->monc);
647 return 0;
648 }
649
650 dout("send_request %p tid %llu to osd%d flags %d\n",
651 req, req->r_tid, req->r_osd->o_osd, req->r_flags);
652
653 reqhead = req->r_request->front.iov_base;
654 reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
655 reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
656 reqhead->reassert_version = req->r_reassert_version;
657
658 req->r_stamp = jiffies;
659 list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
660
661 ceph_msg_get(req->r_request); /* send consumes a ref */
662 ceph_con_send(&req->r_osd->o_con, req->r_request);
663 req->r_sent = req->r_osd->o_incarnation;
664 return 0;
665}
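
/*
 * Annotation (not in the original file): r_sent records the osd's
 * incarnation at send time; __map_osds() above compares it against
 * o_incarnation, so a request last sent to an earlier incarnation of
 * the same osd is not mistaken for "no change" after the connection
 * has been closed and re-opened by __reset_osd().
 */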
666
667/*
668 * Timeout callback, called every N seconds when one or more osd
669 * requests have been active for more than N seconds. When this
670 * happens, we ping all OSDs with timed-out requests to ensure any
671 * communications channel reset is detected. As we go, we push each
672 * request's timeout another N seconds into the future, and we
673 * reschedule the timeout event another N seconds out (unless there
674 * are no open requests).
675 */
676static void handle_timeout(struct work_struct *work)
677{
678 struct ceph_osd_client *osdc =
679 container_of(work, struct ceph_osd_client, timeout_work.work);
680 struct ceph_osd_request *req, *last_req = NULL;
681 struct ceph_osd *osd;
682 unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
683 unsigned long keepalive =
684 osdc->client->mount_args->osd_keepalive_timeout * HZ;
685 unsigned long last_stamp = 0;
686 struct rb_node *p;
687 struct list_head slow_osds;
688
689 dout("timeout\n");
690 down_read(&osdc->map_sem);
691
692 ceph_monc_request_next_osdmap(&osdc->client->monc);
693
694 mutex_lock(&osdc->request_mutex);
695 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
696 req = rb_entry(p, struct ceph_osd_request, r_node);
697
698 if (req->r_resend) {
699 int err;
700
701 dout("osdc resending prev failed %lld\n", req->r_tid);
702 err = __send_request(osdc, req);
703 if (err)
704 dout("osdc failed again on %lld\n", req->r_tid);
705 else
706 req->r_resend = false;
707 continue;
708 }
709 }
710
711 /*
712 * reset osds that appear to be _really_ unresponsive. this
713 * is a failsafe measure; we really shouldn't be getting to
714 * this point if the system is working properly. the monitors
715 * should mark the osd as failed and we should find out about
716 * it from an updated osd map.
717 */
718 while (!list_empty(&osdc->req_lru)) {
719 req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
720 r_req_lru_item);
721
722 if (time_before(jiffies, req->r_stamp + timeout))
723 break;
724
725 BUG_ON(req == last_req && req->r_stamp == last_stamp);
726 last_req = req;
727 last_stamp = req->r_stamp;
728
729 osd = req->r_osd;
730 BUG_ON(!osd);
731 pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
732 req->r_tid, osd->o_osd);
733 __kick_requests(osdc, osd);
734 }
735
736 /*
737 * ping osds that are a bit slow. this ensures that if there
738 * is a break in the TCP connection we will notice, and reopen
739 * a connection with that osd (from the fault callback).
740 */
741 INIT_LIST_HEAD(&slow_osds);
742 list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
743 if (time_before(jiffies, req->r_stamp + keepalive))
744 break;
745
746 osd = req->r_osd;
747 BUG_ON(!osd);
748 dout(" tid %llu is slow, will send keepalive on osd%d\n",
749 req->r_tid, osd->o_osd);
750 list_move_tail(&osd->o_keepalive_item, &slow_osds);
751 }
752 while (!list_empty(&slow_osds)) {
753 osd = list_entry(slow_osds.next, struct ceph_osd,
754 o_keepalive_item);
755 list_del_init(&osd->o_keepalive_item);
756 ceph_con_keepalive(&osd->o_con);
757 }
758
759 __schedule_osd_timeout(osdc);
760 mutex_unlock(&osdc->request_mutex);
761
762 up_read(&osdc->map_sem);
763}
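
/*
 * Annotation (not in the original file): a worked example of the two
 * thresholds above, with assumed mount options osd_timeout=60 and
 * osd_keepalive_timeout=5.  A request idle for more than 5s only earns
 * its osd a ceph_con_keepalive(); one idle for more than 60s gets its
 * osd reset via __kick_requests().  Both scans walk req_lru in r_stamp
 * order, so each stops at the first still-fresh request.
 */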
764
765static void handle_osds_timeout(struct work_struct *work)
766{
767 struct ceph_osd_client *osdc =
768 container_of(work, struct ceph_osd_client,
769 osds_timeout_work.work);
770 unsigned long delay =
771 osdc->client->mount_args->osd_idle_ttl * HZ >> 2;
772
773 dout("osds timeout\n");
774 down_read(&osdc->map_sem);
775 remove_old_osds(osdc, 0);
776 up_read(&osdc->map_sem);
777
778 schedule_delayed_work(&osdc->osds_timeout_work,
779 round_jiffies_relative(delay));
780}
781
782/*
783 * handle osd op reply. either call the callback if it is specified,
784 * or do the completion to wake up the waiting thread.
785 */
786static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
787 struct ceph_connection *con)
788{
789 struct ceph_osd_reply_head *rhead = msg->front.iov_base;
790 struct ceph_osd_request *req;
791 u64 tid;
792 int numops, object_len, flags;
793 s32 result;
794
795 tid = le64_to_cpu(msg->hdr.tid);
796 if (msg->front.iov_len < sizeof(*rhead))
797 goto bad;
798 numops = le32_to_cpu(rhead->num_ops);
799 object_len = le32_to_cpu(rhead->object_len);
800 result = le32_to_cpu(rhead->result);
801 if (msg->front.iov_len != sizeof(*rhead) + object_len +
802 numops * sizeof(struct ceph_osd_op))
803 goto bad;
804 dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
805
806 /* lookup */
807 mutex_lock(&osdc->request_mutex);
808 req = __lookup_request(osdc, tid);
809 if (req == NULL) {
810 dout("handle_reply tid %llu dne\n", tid);
811 mutex_unlock(&osdc->request_mutex);
812 return;
813 }
814 ceph_osdc_get_request(req);
815 flags = le32_to_cpu(rhead->flags);
816
817 /*
818 * if this connection filled our message, drop our reference now, to
819 * avoid a (safe but slower) revoke later.
820 */
821 if (req->r_con_filling_msg == con && req->r_reply == msg) {
822 dout(" dropping con_filling_msg ref %p\n", con);
823 req->r_con_filling_msg = NULL;
824 ceph_con_put(con);
825 }
826
827 if (!req->r_got_reply) {
828 unsigned bytes;
829
830 req->r_result = le32_to_cpu(rhead->result);
831 bytes = le32_to_cpu(msg->hdr.data_len);
832 dout("handle_reply result %d bytes %d\n", req->r_result,
833 bytes);
834 if (req->r_result == 0)
835 req->r_result = bytes;
836
837 /* in case this is a write and we need to replay, */
838 req->r_reassert_version = rhead->reassert_version;
839
840 req->r_got_reply = 1;
841 } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
842 dout("handle_reply tid %llu dup ack\n", tid);
843 mutex_unlock(&osdc->request_mutex);
844 goto done;
845 }
846
847 dout("handle_reply tid %llu flags %d\n", tid, flags);
848
849 /* either this is a read, or we got the safe response */
850 if (result < 0 ||
851 (flags & CEPH_OSD_FLAG_ONDISK) ||
852 ((flags & CEPH_OSD_FLAG_WRITE) == 0))
853 __unregister_request(osdc, req);
854
855 mutex_unlock(&osdc->request_mutex);
856
857 if (req->r_callback)
858 req->r_callback(req, msg);
859 else
860 complete(&req->r_completion);
861
862 if (flags & CEPH_OSD_FLAG_ONDISK) {
863 if (req->r_safe_callback)
864 req->r_safe_callback(req, msg);
865 complete(&req->r_safe_completion); /* fsync waiter */
866 }
867
868done:
869 ceph_osdc_put_request(req);
870 return;
871
872bad:
873 pr_err("corrupt osd_op_reply got %d %d expected %d\n",
874 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
875 (int)sizeof(*rhead));
876 ceph_msg_dump(msg);
877}
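
/*
 * Annotation (not in the original file): writes complete in two phases.
 * The first reply (the ack) wakes r_completion and fires r_callback;
 * only a reply carrying CEPH_OSD_FLAG_ONDISK wakes r_safe_completion,
 * which is what ceph_osdc_sync() waits on.  A duplicate ack (r_got_reply
 * already set and no ONDISK flag) is dropped above via 'goto done'.
 */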
878
879
880static int __kick_requests(struct ceph_osd_client *osdc,
881 struct ceph_osd *kickosd)
882{
883 struct ceph_osd_request *req;
884 struct rb_node *p, *n;
885 int needmap = 0;
886 int err;
887
888 dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
889 if (kickosd) {
890 err = __reset_osd(osdc, kickosd);
891 if (err == -EAGAIN)
892 return 1;
893 } else {
894 for (p = rb_first(&osdc->osds); p; p = n) {
895 struct ceph_osd *osd =
896 rb_entry(p, struct ceph_osd, o_node);
897
898 n = rb_next(p);
899 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
900 memcmp(&osd->o_con.peer_addr,
901 ceph_osd_addr(osdc->osdmap,
902 osd->o_osd),
903 sizeof(struct ceph_entity_addr)) != 0)
904 __reset_osd(osdc, osd);
905 }
906 }
907
908 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
909 req = rb_entry(p, struct ceph_osd_request, r_node);
910
911 if (req->r_resend) {
912 dout(" r_resend set on tid %llu\n", req->r_tid);
913 __cancel_request(req);
914 goto kick;
915 }
916 if (req->r_osd && kickosd == req->r_osd) {
917 __cancel_request(req);
918 goto kick;
919 }
920
921 err = __map_osds(osdc, req);
922 if (err == 0)
923 continue; /* no change */
924 if (err < 0) {
925 /*
926 * FIXME: really, we should set the request
927 * error and fail if this isn't a 'nofail'
928 * request, but that's a fair bit more
929 * complicated to do. So retry!
930 */
931 dout(" setting r_resend on %llu\n", req->r_tid);
932 req->r_resend = true;
933 continue;
934 }
935 if (req->r_osd == NULL) {
936 dout("tid %llu maps to no valid osd\n", req->r_tid);
937 needmap++; /* request a newer map */
938 continue;
939 }
940
941kick:
942 dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
943 req->r_osd ? req->r_osd->o_osd : -1);
944 req->r_flags |= CEPH_OSD_FLAG_RETRY;
945 err = __send_request(osdc, req);
946 if (err) {
947 dout(" setting r_resend on %llu\n", req->r_tid);
948 req->r_resend = true;
949 }
950 }
951
952 return needmap;
953}
954
955/*
956 * Resubmit osd requests whose osd or osd address has changed. Request
957 * a new osd map if osds are down, or we are otherwise unable to determine
958 * how to direct a request.
959 *
960 * Close connections to down osds.
961 *
962 * If @who is specified, resubmit requests for that specific osd.
963 *
964 * Caller should hold map_sem for read and request_mutex.
965 */
966static void kick_requests(struct ceph_osd_client *osdc,
967 struct ceph_osd *kickosd)
968{
969 int needmap;
970
971 mutex_lock(&osdc->request_mutex);
972 needmap = __kick_requests(osdc, kickosd);
973 mutex_unlock(&osdc->request_mutex);
974
975 if (needmap) {
976 dout("%d requests for down osds, need new map\n", needmap);
977 ceph_monc_request_next_osdmap(&osdc->client->monc);
978 }
979}
980
981/*
982 * Process updated osd map.
983 *
984 * The message contains any number of incremental and full maps, normally
985 * indicating some sort of topology change in the cluster. Kick requests
986 * off to different OSDs as needed.
987 */
988void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
989{
990 void *p, *end, *next;
991 u32 nr_maps, maplen;
992 u32 epoch;
993 struct ceph_osdmap *newmap = NULL, *oldmap;
994 int err;
995 struct ceph_fsid fsid;
996
997 dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
998 p = msg->front.iov_base;
999 end = p + msg->front.iov_len;
1000
1001 /* verify fsid */
1002 ceph_decode_need(&p, end, sizeof(fsid), bad);
1003 ceph_decode_copy(&p, &fsid, sizeof(fsid));
1004 if (ceph_check_fsid(osdc->client, &fsid) < 0)
1005 return;
1006
1007 down_write(&osdc->map_sem);
1008
1009 /* incremental maps */
1010 ceph_decode_32_safe(&p, end, nr_maps, bad);
1011 dout(" %d inc maps\n", nr_maps);
1012 while (nr_maps > 0) {
1013 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1014 epoch = ceph_decode_32(&p);
1015 maplen = ceph_decode_32(&p);
1016 ceph_decode_need(&p, end, maplen, bad);
1017 next = p + maplen;
1018 if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
1019 dout("applying incremental map %u len %d\n",
1020 epoch, maplen);
1021 newmap = osdmap_apply_incremental(&p, next,
1022 osdc->osdmap,
1023 osdc->client->msgr);
1024 if (IS_ERR(newmap)) {
1025 err = PTR_ERR(newmap);
1026 goto bad;
1027 }
1028 BUG_ON(!newmap);
1029 if (newmap != osdc->osdmap) {
1030 ceph_osdmap_destroy(osdc->osdmap);
1031 osdc->osdmap = newmap;
1032 }
1033 } else {
1034 dout("ignoring incremental map %u len %d\n",
1035 epoch, maplen);
1036 }
1037 p = next;
1038 nr_maps--;
1039 }
1040 if (newmap)
1041 goto done;
1042
1043 /* full maps */
1044 ceph_decode_32_safe(&p, end, nr_maps, bad);
1045 dout(" %d full maps\n", nr_maps);
1046 while (nr_maps) {
1047 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1048 epoch = ceph_decode_32(&p);
1049 maplen = ceph_decode_32(&p);
1050 ceph_decode_need(&p, end, maplen, bad);
1051 if (nr_maps > 1) {
1052 dout("skipping non-latest full map %u len %d\n",
1053 epoch, maplen);
1054 } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
1055 dout("skipping full map %u len %d, "
1056 "older than our %u\n", epoch, maplen,
1057 osdc->osdmap->epoch);
1058 } else {
1059 dout("taking full map %u len %d\n", epoch, maplen);
1060 newmap = osdmap_decode(&p, p+maplen);
1061 if (IS_ERR(newmap)) {
1062 err = PTR_ERR(newmap);
1063 goto bad;
1064 }
1065 BUG_ON(!newmap);
1066 oldmap = osdc->osdmap;
1067 osdc->osdmap = newmap;
1068 if (oldmap)
1069 ceph_osdmap_destroy(oldmap);
1070 }
1071 p += maplen;
1072 nr_maps--;
1073 }
1074
1075done:
1076 downgrade_write(&osdc->map_sem);
1077 ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
1078 if (newmap)
1079 kick_requests(osdc, NULL);
1080 up_read(&osdc->map_sem);
1081 return;
1082
1083bad:
1084 pr_err("osdc handle_map corrupt msg\n");
1085 ceph_msg_dump(msg);
1086 up_write(&osdc->map_sem);
1087 return;
1088}
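
/*
 * Annotation (not in the original file): incremental maps are applied
 * in preference to full maps; note the 'if (newmap) goto done' between
 * the two loops, so a full map is decoded only when no incremental
 * could be applied.  Either way, kick_requests() then remaps every
 * in-flight request against the new epoch.
 */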
1089
1090
1091/*
1092 * A read request prepares specific pages that data is to be read into.
1093 * When a message is being read off the wire, we call __prepare_pages to
1094 * find those pages.
1095 * Returns 0 on success, -1 on failure.
1096 */
1097static int __prepare_pages(struct ceph_connection *con,
1098 struct ceph_msg_header *hdr,
1099 struct ceph_osd_request *req,
1100 u64 tid,
1101 struct ceph_msg *m)
1102{
1103 struct ceph_osd *osd = con->private;
1104 struct ceph_osd_client *osdc;
1105 int ret = -1;
1106 int data_len = le32_to_cpu(hdr->data_len);
1107 unsigned data_off = le16_to_cpu(hdr->data_off);
1108
1109 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1110
1111 if (!osd)
1112 return -1;
1113
1114 osdc = osd->o_osdc;
1115
1116 dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
1117 tid, req->r_num_pages, want);
1118 if (unlikely(req->r_num_pages < want))
1119 goto out;
1120 m->pages = req->r_pages;
1121 m->nr_pages = req->r_num_pages;
1122 ret = 0; /* success */
1123out:
1124 BUG_ON(ret < 0 || m->nr_pages < want);
1125
1126 return ret;
1127}
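
/*
 * Annotation (not in the original file): worked arithmetic for 'want'
 * above.  calc_pages_for(off, len) counts the pages an extent touches,
 * i.e. ((off + len + PAGE_SIZE - 1) >> PAGE_SHIFT) - (off >> PAGE_SHIFT)
 * up to page-cache naming.  With 4 KiB pages, data_off = 0x1800 and
 * data_len = 0x3000: the in-page offset is 0x1800 & ~PAGE_MASK = 0x800,
 * so the payload spans 0x800..0x3800 and want = 4 pages, one more than
 * the 3 pages of payload alone.
 */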
1128
1129/*
1130 * Register request, send initial attempt.
1131 */
1132int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1133 struct ceph_osd_request *req,
1134 bool nofail)
1135{
1136 int rc = 0;
1137
1138 req->r_request->pages = req->r_pages;
1139 req->r_request->nr_pages = req->r_num_pages;
1140
1141 register_request(osdc, req);
1142
1143 down_read(&osdc->map_sem);
1144 mutex_lock(&osdc->request_mutex);
1145 /*
1146 * a racing kick_requests() may have sent the message for us
1147 * while we dropped request_mutex above, so only send now if
1148 * the request hasn't been touched yet.
1149 */
1150 if (req->r_sent == 0) {
1151 rc = __send_request(osdc, req);
1152 if (rc) {
1153 if (nofail) {
1154 dout("osdc_start_request failed send, "
1155 "marking %lld\n", req->r_tid);
1156 req->r_resend = true;
1157 rc = 0;
1158 } else {
1159 __unregister_request(osdc, req);
1160 }
1161 }
1162 }
1163 mutex_unlock(&osdc->request_mutex);
1164 up_read(&osdc->map_sem);
1165 return rc;
1166}
1167
1168/*
1169 * wait for a request to complete
1170 */
1171int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
1172 struct ceph_osd_request *req)
1173{
1174 int rc;
1175
1176 rc = wait_for_completion_interruptible(&req->r_completion);
1177 if (rc < 0) {
1178 mutex_lock(&osdc->request_mutex);
1179 __cancel_request(req);
1180 __unregister_request(osdc, req);
1181 mutex_unlock(&osdc->request_mutex);
1182 dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
1183 return rc;
1184 }
1185
1186 dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
1187 return req->r_result;
1188}
1189
1190/*
1191 * sync - wait for all in-flight requests to flush. avoid starvation.
1192 */
1193void ceph_osdc_sync(struct ceph_osd_client *osdc)
1194{
1195 struct ceph_osd_request *req;
1196 u64 last_tid, next_tid = 0;
1197
1198 mutex_lock(&osdc->request_mutex);
1199 last_tid = osdc->last_tid;
1200 while (1) {
1201 req = __lookup_request_ge(osdc, next_tid);
1202 if (!req)
1203 break;
1204 if (req->r_tid > last_tid)
1205 break;
1206
1207 next_tid = req->r_tid + 1;
1208 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
1209 continue;
1210
1211 ceph_osdc_get_request(req);
1212 mutex_unlock(&osdc->request_mutex);
1213 dout("sync waiting on tid %llu (last is %llu)\n",
1214 req->r_tid, last_tid);
1215 wait_for_completion(&req->r_safe_completion);
1216 mutex_lock(&osdc->request_mutex);
1217 ceph_osdc_put_request(req);
1218 }
1219 mutex_unlock(&osdc->request_mutex);
1220 dout("sync done (thru tid %llu)\n", last_tid);
1221}
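
/*
 * Annotation (not in the original file): the next_tid/last_tid walk
 * above avoids starvation: only writes that already existed when the
 * sync began (tid <= last_tid) are waited on, and request_mutex is
 * dropped around each wait so new requests can still be registered
 * while the flush is in progress.
 */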
1222
1223/*
1224 * init, shutdown
1225 */
1226int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1227{
1228 int err;
1229
1230 dout("init\n");
1231 osdc->client = client;
1232 osdc->osdmap = NULL;
1233 init_rwsem(&osdc->map_sem);
1234 init_completion(&osdc->map_waiters);
1235 osdc->last_requested_map = 0;
1236 mutex_init(&osdc->request_mutex);
1237 osdc->last_tid = 0;
1238 osdc->osds = RB_ROOT;
1239 INIT_LIST_HEAD(&osdc->osd_lru);
1240 osdc->requests = RB_ROOT;
1241 INIT_LIST_HEAD(&osdc->req_lru);
1242 osdc->num_requests = 0;
1243 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
1244 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
1245
1246 schedule_delayed_work(&osdc->osds_timeout_work,
1247 round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ));
1248
1249 err = -ENOMEM;
1250 osdc->req_mempool = mempool_create_kmalloc_pool(10,
1251 sizeof(struct ceph_osd_request));
1252 if (!osdc->req_mempool)
1253 goto out;
1254
1255 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true);
1256 if (err < 0)
1257 goto out_mempool;
1258 err = ceph_msgpool_init(&osdc->msgpool_op_reply,
1259 OSD_OPREPLY_FRONT_LEN, 10, true);
1260 if (err < 0)
1261 goto out_msgpool;
1262 return 0;
1263
1264out_msgpool:
1265 ceph_msgpool_destroy(&osdc->msgpool_op);
1266out_mempool:
1267 mempool_destroy(osdc->req_mempool);
1268out:
1269 return err;
1270}
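
/*
 * Annotation (not in the original file): the error path above is the
 * usual kernel unwind ladder; each label releases only what was set up
 * before the failure, in reverse order, so a failed msgpool_op_reply
 * init tears down msgpool_op and then the request mempool, and a failed
 * mempool_create just returns.
 */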
1271
1272void ceph_osdc_stop(struct ceph_osd_client *osdc)
1273{
1274 cancel_delayed_work_sync(&osdc->timeout_work);
1275 cancel_delayed_work_sync(&osdc->osds_timeout_work);
1276 if (osdc->osdmap) {
1277 ceph_osdmap_destroy(osdc->osdmap);
1278 osdc->osdmap = NULL;
1279 }
1280 remove_old_osds(osdc, 1);
1281 mempool_destroy(osdc->req_mempool);
1282 ceph_msgpool_destroy(&osdc->msgpool_op);
1283 ceph_msgpool_destroy(&osdc->msgpool_op_reply);
1284}
1285
1286/*
1287 * Read some contiguous pages. If we cross a stripe boundary, shorten
1288 * *plen. Return number of bytes read, or error.
1289 */
1290int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1291 struct ceph_vino vino, struct ceph_file_layout *layout,
1292 u64 off, u64 *plen,
1293 u32 truncate_seq, u64 truncate_size,
1294 struct page **pages, int num_pages)
1295{
1296 struct ceph_osd_request *req;
1297 int rc = 0;
1298
1299 dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
1300 vino.snap, off, *plen);
1301 req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
1302 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1303 NULL, 0, truncate_seq, truncate_size, NULL,
1304 false, 1);
1305 if (IS_ERR(req))
1306 return PTR_ERR(req);
1307
1308 /* it may be a short read due to an object boundary */
1309 req->r_pages = pages;
1310 num_pages = calc_pages_for(off, *plen);
1311 req->r_num_pages = num_pages;
1312
1313 dout("readpages final extent is %llu~%llu (%d pages)\n",
1314 off, *plen, req->r_num_pages);
1315
1316 rc = ceph_osdc_start_request(osdc, req, false);
1317 if (!rc)
1318 rc = ceph_osdc_wait_request(osdc, req);
1319
1320 ceph_osdc_put_request(req);
1321 dout("readpages result %d\n", rc);
1322 return rc;
1323}
1324
1325/*
1326 * do a synchronous write on N pages
1327 */
1328int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1329 struct ceph_file_layout *layout,
1330 struct ceph_snap_context *snapc,
1331 u64 off, u64 len,
1332 u32 truncate_seq, u64 truncate_size,
1333 struct timespec *mtime,
1334 struct page **pages, int num_pages,
1335 int flags, int do_sync, bool nofail)
1336{
1337 struct ceph_osd_request *req;
1338 int rc = 0;
1339
1340 BUG_ON(vino.snap != CEPH_NOSNAP);
1341 req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
1342 CEPH_OSD_OP_WRITE,
1343 flags | CEPH_OSD_FLAG_ONDISK |
1344 CEPH_OSD_FLAG_WRITE,
1345 snapc, do_sync,
1346 truncate_seq, truncate_size, mtime,
1347 nofail, 1);
1348 if (IS_ERR(req))
1349 return PTR_ERR(req);
1350
1351 /* it may be a short write due to an object boundary */
1352 req->r_pages = pages;
1353 req->r_num_pages = calc_pages_for(off, len);
1354 dout("writepages %llu~%llu (%d pages)\n", off, len,
1355 req->r_num_pages);
1356
1357 rc = ceph_osdc_start_request(osdc, req, nofail);
1358 if (!rc)
1359 rc = ceph_osdc_wait_request(osdc, req);
1360
1361 ceph_osdc_put_request(req);
1362 if (rc == 0)
1363 rc = len;
1364 dout("writepages result %d\n", rc);
1365 return rc;
1366}
1367
1368/*
1369 * handle incoming message
1370 */
1371static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1372{
1373 struct ceph_osd *osd = con->private;
1374 struct ceph_osd_client *osdc;
1375 int type = le16_to_cpu(msg->hdr.type);
1376
1377 if (!osd)
1378 return;
1379 osdc = osd->o_osdc;
1380
1381 switch (type) {
1382 case CEPH_MSG_OSD_MAP:
1383 ceph_osdc_handle_map(osdc, msg);
1384 break;
1385 case CEPH_MSG_OSD_OPREPLY:
1386 handle_reply(osdc, msg, con);
1387 break;
1388
1389 default:
1390 pr_err("received unknown message type %d %s\n", type,
1391 ceph_msg_type_name(type));
1392 }
1393 ceph_msg_put(msg);
1394}
1395
1396/*
1397 * lookup and return message for incoming reply
1398 */
1399static struct ceph_msg *get_reply(struct ceph_connection *con,
1400 struct ceph_msg_header *hdr,
1401 int *skip)
1402{
1403 struct ceph_osd *osd = con->private;
1404 struct ceph_osd_client *osdc = osd->o_osdc;
1405 struct ceph_msg *m;
1406 struct ceph_osd_request *req;
1407 int front = le32_to_cpu(hdr->front_len);
1408 int data_len = le32_to_cpu(hdr->data_len);
1409 u64 tid;
1410 int err;
1411
1412 tid = le64_to_cpu(hdr->tid);
1413 mutex_lock(&osdc->request_mutex);
1414 req = __lookup_request(osdc, tid);
1415 if (!req) {
1416 *skip = 1;
1417 m = NULL;
1418 pr_info("get_reply unknown tid %llu from osd%d\n", tid,
1419 osd->o_osd);
1420 goto out;
1421 }
1422
1423 if (req->r_con_filling_msg) {
1424 dout("get_reply revoking msg %p from old con %p\n",
1425 req->r_reply, req->r_con_filling_msg);
1426 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
1427 ceph_con_put(req->r_con_filling_msg);
1428 }
1429
1430 if (front > req->r_reply->front.iov_len) {
1431 pr_warning("get_reply front %d > preallocated %d\n",
1432 front, (int)req->r_reply->front.iov_len);
1433 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL);
1434 if (IS_ERR(m))
1435 goto out;
1436 ceph_msg_put(req->r_reply);
1437 req->r_reply = m;
1438 }
1439 m = ceph_msg_get(req->r_reply);
1440
1441 if (data_len > 0) {
1442 err = __prepare_pages(con, hdr, req, tid, m);
1443 if (err < 0) {
1444 *skip = 1;
1445 ceph_msg_put(m);
1446 m = ERR_PTR(err);
1447 }
1448 }
1449 *skip = 0;
1450 req->r_con_filling_msg = ceph_con_get(con);
1451 dout("get_reply tid %lld %p\n", tid, m);
1452
1453out:
1454 mutex_unlock(&osdc->request_mutex);
1455 return m;
1456
1457}
1458
1459static struct ceph_msg *alloc_msg(struct ceph_connection *con,
1460 struct ceph_msg_header *hdr,
1461 int *skip)
1462{
1463 struct ceph_osd *osd = con->private;
1464 int type = le16_to_cpu(hdr->type);
1465 int front = le32_to_cpu(hdr->front_len);
1466
1467 switch (type) {
1468 case CEPH_MSG_OSD_MAP:
1469 return ceph_msg_new(type, front, 0, 0, NULL);
1470 case CEPH_MSG_OSD_OPREPLY:
1471 return get_reply(con, hdr, skip);
1472 default:
1473 pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
1474 osd->o_osd);
1475 *skip = 1;
1476 return NULL;
1477 }
1478}
1479
1480/*
1481 * Wrappers to refcount containing ceph_osd struct
1482 */
1483static struct ceph_connection *get_osd_con(struct ceph_connection *con)
1484{
1485 struct ceph_osd *osd = con->private;
1486 if (get_osd(osd))
1487 return con;
1488 return NULL;
1489}
1490
1491static void put_osd_con(struct ceph_connection *con)
1492{
1493 struct ceph_osd *osd = con->private;
1494 put_osd(osd);
1495}
1496
1497/*
1498 * authentication
1499 */
1500static int get_authorizer(struct ceph_connection *con,
1501 void **buf, int *len, int *proto,
1502 void **reply_buf, int *reply_len, int force_new)
1503{
1504 struct ceph_osd *o = con->private;
1505 struct ceph_osd_client *osdc = o->o_osdc;
1506 struct ceph_auth_client *ac = osdc->client->monc.auth;
1507 int ret = 0;
1508
1509 if (force_new && o->o_authorizer) {
1510 ac->ops->destroy_authorizer(ac, o->o_authorizer);
1511 o->o_authorizer = NULL;
1512 }
1513 if (o->o_authorizer == NULL) {
1514 ret = ac->ops->create_authorizer(
1515 ac, CEPH_ENTITY_TYPE_OSD,
1516 &o->o_authorizer,
1517 &o->o_authorizer_buf,
1518 &o->o_authorizer_buf_len,
1519 &o->o_authorizer_reply_buf,
1520 &o->o_authorizer_reply_buf_len);
1521 if (ret)
1522 return ret;
1523 }
1524
1525 *proto = ac->protocol;
1526 *buf = o->o_authorizer_buf;
1527 *len = o->o_authorizer_buf_len;
1528 *reply_buf = o->o_authorizer_reply_buf;
1529 *reply_len = o->o_authorizer_reply_buf_len;
1530 return 0;
1531}
1532
1533
1534static int verify_authorizer_reply(struct ceph_connection *con, int len)
1535{
1536 struct ceph_osd *o = con->private;
1537 struct ceph_osd_client *osdc = o->o_osdc;
1538 struct ceph_auth_client *ac = osdc->client->monc.auth;
1539
1540 return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
1541}
1542
1543static int invalidate_authorizer(struct ceph_connection *con)
1544{
1545 struct ceph_osd *o = con->private;
1546 struct ceph_osd_client *osdc = o->o_osdc;
1547 struct ceph_auth_client *ac = osdc->client->monc.auth;
1548
1549 if (ac->ops->invalidate_authorizer)
1550 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
1551
1552 return ceph_monc_validate_auth(&osdc->client->monc);
1553}
1554
1555static const struct ceph_connection_operations osd_con_ops = {
1556 .get = get_osd_con,
1557 .put = put_osd_con,
1558 .dispatch = dispatch,
1559 .get_authorizer = get_authorizer,
1560 .verify_authorizer_reply = verify_authorizer_reply,
1561 .invalidate_authorizer = invalidate_authorizer,
1562 .alloc_msg = alloc_msg,
1563 .fault = osd_reset,
1564};
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
new file mode 100644
index 000000000000..ce776989ef6a
--- /dev/null
+++ b/fs/ceph/osd_client.h
@@ -0,0 +1,167 @@
1#ifndef _FS_CEPH_OSD_CLIENT_H
2#define _FS_CEPH_OSD_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/mempool.h>
7#include <linux/rbtree.h>
8
9#include "types.h"
10#include "osdmap.h"
11#include "messenger.h"
12
13struct ceph_msg;
14struct ceph_snap_context;
15struct ceph_osd_request;
16struct ceph_osd_client;
17struct ceph_authorizer;
18
19/*
20 * completion callback for async writepages
21 */
22typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
23 struct ceph_msg *);
24
25/* a given osd we're communicating with */
26struct ceph_osd {
27 atomic_t o_ref;
28 struct ceph_osd_client *o_osdc;
29 int o_osd;
30 int o_incarnation;
31 struct rb_node o_node;
32 struct ceph_connection o_con;
33 struct list_head o_requests;
34 struct list_head o_osd_lru;
35 struct ceph_authorizer *o_authorizer;
36 void *o_authorizer_buf, *o_authorizer_reply_buf;
37 size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
38 unsigned long lru_ttl;
39 int o_marked_for_keepalive;
40 struct list_head o_keepalive_item;
41};
42
43/* an in-flight request */
44struct ceph_osd_request {
45 u64 r_tid; /* unique for this client */
46 struct rb_node r_node;
47 struct list_head r_req_lru_item;
48 struct list_head r_osd_item;
49 struct ceph_osd *r_osd;
50 struct ceph_pg r_pgid;
51 int r_pg_osds[CEPH_PG_MAX_SIZE];
52 int r_num_pg_osds;
53
54 struct ceph_connection *r_con_filling_msg;
55
56 struct ceph_msg *r_request, *r_reply;
57 int r_result;
58 int r_flags; /* any additional flags for the osd */
59 u32 r_sent; /* >0 if r_request is sending/sent */
60 int r_got_reply;
61
62 struct ceph_osd_client *r_osdc;
63 struct kref r_kref;
64 bool r_mempool;
65 struct completion r_completion, r_safe_completion;
66 ceph_osdc_callback_t r_callback, r_safe_callback;
67 struct ceph_eversion r_reassert_version;
68 struct list_head r_unsafe_item;
69
70 struct inode *r_inode; /* for use by callbacks */
71
72 char r_oid[40]; /* object name */
73 int r_oid_len;
74 unsigned long r_stamp; /* send OR check time */
75 bool r_resend; /* msg send failed, needs retry */
76
77 struct ceph_file_layout r_file_layout;
78 struct ceph_snap_context *r_snapc; /* snap context for writes */
79 unsigned r_num_pages; /* size of page array (follows) */
80 struct page **r_pages; /* pages for data payload */
81 int r_pages_from_pool;
82 int r_own_pages; /* if true, i own page list */
83};
84
85struct ceph_osd_client {
86 struct ceph_client *client;
87
88 struct ceph_osdmap *osdmap; /* current map */
89 struct rw_semaphore map_sem;
90 struct completion map_waiters;
91 u64 last_requested_map;
92
93 struct mutex request_mutex;
94 struct rb_root osds; /* osds */
95 struct list_head osd_lru; /* idle osds */
96 u64 timeout_tid; /* tid of timeout triggering rq */
97 u64 last_tid; /* tid of last request */
98 struct rb_root requests; /* pending requests */
99 struct list_head req_lru; /* pending requests lru */
100 int num_requests;
101 struct delayed_work timeout_work;
102 struct delayed_work osds_timeout_work;
103#ifdef CONFIG_DEBUG_FS
104 struct dentry *debugfs_file;
105#endif
106
107 mempool_t *req_mempool;
108
109 struct ceph_msgpool msgpool_op;
110 struct ceph_msgpool msgpool_op_reply;
111};
112
113extern int ceph_osdc_init(struct ceph_osd_client *osdc,
114 struct ceph_client *client);
115extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
116
117extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
118 struct ceph_msg *msg);
119extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
120 struct ceph_msg *msg);
121
122extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
123 struct ceph_file_layout *layout,
124 struct ceph_vino vino,
125 u64 offset, u64 *len, int op, int flags,
126 struct ceph_snap_context *snapc,
127 int do_sync, u32 truncate_seq,
128 u64 truncate_size,
129 struct timespec *mtime,
130 bool use_mempool, int num_reply);
131
132static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
133{
134 kref_get(&req->r_kref);
135}
136extern void ceph_osdc_release_request(struct kref *kref);
137static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
138{
139 kref_put(&req->r_kref, ceph_osdc_release_request);
140}
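
/*
 * Annotation (not in the original header): references are kref-based
 * and must be paired; for example, ceph_osdc_sync() in osd_client.c does
 *
 *	ceph_osdc_get_request(req);
 *	mutex_unlock(&osdc->request_mutex);
 *	wait_for_completion(&req->r_safe_completion);
 *	mutex_lock(&osdc->request_mutex);
 *	ceph_osdc_put_request(req);
 *
 * where the final put may free req through ceph_osdc_release_request().
 */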
141
142extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
143 struct ceph_osd_request *req,
144 bool nofail);
145extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
146 struct ceph_osd_request *req);
147extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
148
149extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
150 struct ceph_vino vino,
151 struct ceph_file_layout *layout,
152 u64 off, u64 *plen,
153 u32 truncate_seq, u64 truncate_size,
154 struct page **pages, int nr_pages);
155
156extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
157 struct ceph_vino vino,
158 struct ceph_file_layout *layout,
159 struct ceph_snap_context *sc,
160 u64 off, u64 len,
161 u32 truncate_seq, u64 truncate_size,
162 struct timespec *mtime,
163 struct page **pages, int nr_pages,
164 int flags, int do_sync, bool nofail);
165
166#endif
167
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
new file mode 100644
index 000000000000..cfdd8f4388b7
--- /dev/null
+++ b/fs/ceph/osdmap.c
@@ -0,0 +1,1081 @@
1
2#include "ceph_debug.h"
3
4#include <linux/slab.h>
5#include <asm/div64.h>
6
7#include "super.h"
8#include "osdmap.h"
9#include "crush/hash.h"
10#include "crush/mapper.h"
11#include "decode.h"
12
13char *ceph_osdmap_state_str(char *str, int len, int state)
14{
15 int flag = 0;
16
17 if (!len)
18 goto done;
19
20 *str = '\0';
21 if (state) {
22 if (state & CEPH_OSD_EXISTS) {
23 snprintf(str, len, "exists");
24 flag = 1;
25 }
26 if (state & CEPH_OSD_UP) {
27 int n = strlen(str); /* append: str may not be its own snprintf source */
28 snprintf(str + n, len - n, "%s%s", (flag ? ", " : ""), "up");
29 flag = 1;
30 }
31 } else {
32 snprintf(str, len, "doesn't exist");
33 }
34done:
35 return str;
36}
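
/*
 * Illustrative usage (an annotation, not part of the original file;
 * 'sbuf' and the surrounding loop index are assumed):
 *
 *	char sbuf[64];
 *
 *	dout("osd%d %s\n", i,
 *	     ceph_osdmap_state_str(sbuf, sizeof(sbuf), map->osd_state[i]));
 */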
37
38/* maps */
39
40static int calc_bits_of(unsigned t)
41{
42 int b = 0;
43 while (t) {
44 t = t >> 1;
45 b++;
46 }
47 return b;
48}
49
50/*
51 * the foo_mask is the smallest value 2^n-1 that is >= foo-1.
52 */
53static void calc_pg_masks(struct ceph_pg_pool_info *pi)
54{
55 pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
56 pi->pgp_num_mask =
57 (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
58 pi->lpg_num_mask =
59 (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
60 pi->lpgp_num_mask =
61 (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
62}
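
/*
 * Annotation (not in the original file): a worked example.  For
 * pg_num = 6, calc_bits_of(5) = 3 and pg_num_mask = (1 << 3) - 1 = 7;
 * for pg_num = 8, calc_bits_of(7) = 3 and pg_num_mask = 7 = pg_num - 1.
 * The masks therefore track foo-1, which is what the stable-mod
 * placement math wants.
 */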
63
64/*
65 * decode crush map
66 */
67static int crush_decode_uniform_bucket(void **p, void *end,
68 struct crush_bucket_uniform *b)
69{
70 dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
71 ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
72 b->item_weight = ceph_decode_32(p);
73 return 0;
74bad:
75 return -EINVAL;
76}
77
78static int crush_decode_list_bucket(void **p, void *end,
79 struct crush_bucket_list *b)
80{
81 int j;
82 dout("crush_decode_list_bucket %p to %p\n", *p, end);
83 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
84 if (b->item_weights == NULL)
85 return -ENOMEM;
86 b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
87 if (b->sum_weights == NULL)
88 return -ENOMEM;
89 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
90 for (j = 0; j < b->h.size; j++) {
91 b->item_weights[j] = ceph_decode_32(p);
92 b->sum_weights[j] = ceph_decode_32(p);
93 }
94 return 0;
95bad:
96 return -EINVAL;
97}
98
99static int crush_decode_tree_bucket(void **p, void *end,
100 struct crush_bucket_tree *b)
101{
102 int j;
103 dout("crush_decode_tree_bucket %p to %p\n", *p, end);
104 ceph_decode_32_safe(p, end, b->num_nodes, bad);
105 b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
106 if (b->node_weights == NULL)
107 return -ENOMEM;
108 ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
109 for (j = 0; j < b->num_nodes; j++)
110 b->node_weights[j] = ceph_decode_32(p);
111 return 0;
112bad:
113 return -EINVAL;
114}
115
116static int crush_decode_straw_bucket(void **p, void *end,
117 struct crush_bucket_straw *b)
118{
119 int j;
120 dout("crush_decode_straw_bucket %p to %p\n", *p, end);
121 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
122 if (b->item_weights == NULL)
123 return -ENOMEM;
124 b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
125 if (b->straws == NULL)
126 return -ENOMEM;
127 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
128 for (j = 0; j < b->h.size; j++) {
129 b->item_weights[j] = ceph_decode_32(p);
130 b->straws[j] = ceph_decode_32(p);
131 }
132 return 0;
133bad:
134 return -EINVAL;
135}
136
137static struct crush_map *crush_decode(void *pbyval, void *end)
138{
139 struct crush_map *c;
140 int err = -EINVAL;
141 int i, j;
142 void **p = &pbyval;
143 void *start = pbyval;
144 u32 magic;
145
146 dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
147
148 c = kzalloc(sizeof(*c), GFP_NOFS);
149 if (c == NULL)
150 return ERR_PTR(-ENOMEM);
151
152 ceph_decode_need(p, end, 4*sizeof(u32), bad);
153 magic = ceph_decode_32(p);
154 if (magic != CRUSH_MAGIC) {
155 pr_err("crush_decode magic %x != current %x\n",
156 (unsigned)magic, (unsigned)CRUSH_MAGIC);
157 goto bad;
158 }
159 c->max_buckets = ceph_decode_32(p);
160 c->max_rules = ceph_decode_32(p);
161 c->max_devices = ceph_decode_32(p);
162
163 c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
164 if (c->device_parents == NULL)
165 goto badmem;
166 c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
167 if (c->bucket_parents == NULL)
168 goto badmem;
169
170 c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
171 if (c->buckets == NULL)
172 goto badmem;
173 c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
174 if (c->rules == NULL)
175 goto badmem;
176
177 /* buckets */
178 for (i = 0; i < c->max_buckets; i++) {
179 int size = 0;
180 u32 alg;
181 struct crush_bucket *b;
182
183 ceph_decode_32_safe(p, end, alg, bad);
184 if (alg == 0) {
185 c->buckets[i] = NULL;
186 continue;
187 }
188 dout("crush_decode bucket %d off %x %p to %p\n",
189 i, (int)(*p-start), *p, end);
190
191 switch (alg) {
192 case CRUSH_BUCKET_UNIFORM:
193 size = sizeof(struct crush_bucket_uniform);
194 break;
195 case CRUSH_BUCKET_LIST:
196 size = sizeof(struct crush_bucket_list);
197 break;
198 case CRUSH_BUCKET_TREE:
199 size = sizeof(struct crush_bucket_tree);
200 break;
201 case CRUSH_BUCKET_STRAW:
202 size = sizeof(struct crush_bucket_straw);
203 break;
204 default:
205 err = -EINVAL;
206 goto bad;
207 }
208 BUG_ON(size == 0);
209 b = c->buckets[i] = kzalloc(size, GFP_NOFS);
210 if (b == NULL)
211 goto badmem;
212
213 ceph_decode_need(p, end, 4*sizeof(u32), bad);
214 b->id = ceph_decode_32(p);
215 b->type = ceph_decode_16(p);
216 b->alg = ceph_decode_8(p);
217 b->hash = ceph_decode_8(p);
218 b->weight = ceph_decode_32(p);
219 b->size = ceph_decode_32(p);
220
221 dout("crush_decode bucket size %d off %x %p to %p\n",
222 b->size, (int)(*p-start), *p, end);
223
224 b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
225 if (b->items == NULL)
226 goto badmem;
227 b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
228 if (b->perm == NULL)
229 goto badmem;
230 b->perm_n = 0;
231
232 ceph_decode_need(p, end, b->size*sizeof(u32), bad);
233 for (j = 0; j < b->size; j++)
234 b->items[j] = ceph_decode_32(p);
235
236 switch (b->alg) {
237 case CRUSH_BUCKET_UNIFORM:
238 err = crush_decode_uniform_bucket(p, end,
239 (struct crush_bucket_uniform *)b);
240 if (err < 0)
241 goto bad;
242 break;
243 case CRUSH_BUCKET_LIST:
244 err = crush_decode_list_bucket(p, end,
245 (struct crush_bucket_list *)b);
246 if (err < 0)
247 goto bad;
248 break;
249 case CRUSH_BUCKET_TREE:
250 err = crush_decode_tree_bucket(p, end,
251 (struct crush_bucket_tree *)b);
252 if (err < 0)
253 goto bad;
254 break;
255 case CRUSH_BUCKET_STRAW:
256 err = crush_decode_straw_bucket(p, end,
257 (struct crush_bucket_straw *)b);
258 if (err < 0)
259 goto bad;
260 break;
261 }
262 }
263
264 /* rules */
265 dout("rule vec is %p\n", c->rules);
266 for (i = 0; i < c->max_rules; i++) {
267 u32 yes;
268 struct crush_rule *r;
269
270 ceph_decode_32_safe(p, end, yes, bad);
271 if (!yes) {
272 dout("crush_decode NO rule %d off %x %p to %p\n",
273 i, (int)(*p-start), *p, end);
274 c->rules[i] = NULL;
275 continue;
276 }
277
278 dout("crush_decode rule %d off %x %p to %p\n",
279 i, (int)(*p-start), *p, end);
280
281 /* len */
282 ceph_decode_32_safe(p, end, yes, bad);
283#if BITS_PER_LONG == 32
284 err = -EINVAL;
285 if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
286 goto bad;
287#endif
288 r = c->rules[i] = kmalloc(sizeof(*r) +
289 yes*sizeof(struct crush_rule_step),
290 GFP_NOFS);
291 if (r == NULL)
292 goto badmem;
293 dout(" rule %d is at %p\n", i, r);
294 r->len = yes;
295 ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
296 ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
297 for (j = 0; j < r->len; j++) {
298 r->steps[j].op = ceph_decode_32(p);
299 r->steps[j].arg1 = ceph_decode_32(p);
300 r->steps[j].arg2 = ceph_decode_32(p);
301 }
302 }
303
304 /* ignore trailing name maps. */
305
306 dout("crush_decode success\n");
307 return c;
308
309badmem:
310 err = -ENOMEM;
311bad:
312 dout("crush_decode fail %d\n", err);
313 crush_destroy(c);
314 return ERR_PTR(err);
315}
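
/*
 * Annotation (not in the original file): every variable-length read
 * above is bounds-checked against 'end' by ceph_decode_need() or a
 * ceph_decode_*_safe() helper before the unchecked ceph_decode_*()
 * accessors run, so truncated or corrupt input lands on the 'bad'
 * label instead of reading past the buffer.
 */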
316
317/*
318 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
319 * to a set of osds)
320 */
321static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
322{
323 u64 a = *(u64 *)&l;
324 u64 b = *(u64 *)&r;
325
326 if (a < b)
327 return -1;
328 if (a > b)
329 return 1;
330 return 0;
331}
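
/*
 * Annotation (not in the original file): pgid_cmp() compares the raw
 * 64-bit encoding of struct ceph_pg, which works because the packed
 * struct occupies exactly 8 bytes; the resulting order is arbitrary
 * but total, which is all the rbtree below needs.
 */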
332
333static int __insert_pg_mapping(struct ceph_pg_mapping *new,
334 struct rb_root *root)
335{
336 struct rb_node **p = &root->rb_node;
337 struct rb_node *parent = NULL;
338 struct ceph_pg_mapping *pg = NULL;
339 int c;
340
341 while (*p) {
342 parent = *p;
343 pg = rb_entry(parent, struct ceph_pg_mapping, node);
344 c = pgid_cmp(new->pgid, pg->pgid);
345 if (c < 0)
346 p = &(*p)->rb_left;
347 else if (c > 0)
348 p = &(*p)->rb_right;
349 else
350 return -EEXIST;
351 }
352
353 rb_link_node(&new->node, parent, p);
354 rb_insert_color(&new->node, root);
355 return 0;
356}
357
358static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
359 struct ceph_pg pgid)
360{
361 struct rb_node *n = root->rb_node;
362 struct ceph_pg_mapping *pg;
363 int c;
364
365 while (n) {
366 pg = rb_entry(n, struct ceph_pg_mapping, node);
367 c = pgid_cmp(pgid, pg->pgid);
368 if (c < 0)
369 n = n->rb_left;
370 else if (c > 0)
371 n = n->rb_right;
372 else
373 return pg;
374 }
375 return NULL;
376}
377
378/*
379 * rbtree of pg pool info
380 */
381static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
382{
383 struct rb_node **p = &root->rb_node;
384 struct rb_node *parent = NULL;
385 struct ceph_pg_pool_info *pi = NULL;
386
387 while (*p) {
388 parent = *p;
389 pi = rb_entry(parent, struct ceph_pg_pool_info, node);
390 if (new->id < pi->id)
391 p = &(*p)->rb_left;
392 else if (new->id > pi->id)
393 p = &(*p)->rb_right;
394 else
395 return -EEXIST;
396 }
397
398 rb_link_node(&new->node, parent, p);
399 rb_insert_color(&new->node, root);
400 return 0;
401}
402
403static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
404{
405 struct ceph_pg_pool_info *pi;
406 struct rb_node *n = root->rb_node;
407
408 while (n) {
409 pi = rb_entry(n, struct ceph_pg_pool_info, node);
410 if (id < pi->id)
411 n = n->rb_left;
412 else if (id > pi->id)
413 n = n->rb_right;
414 else
415 return pi;
416 }
417 return NULL;
418}
419
420static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
421{
422 rb_erase(&pi->node, root);
423 kfree(pi->name);
424 kfree(pi);
425}
426
427void __decode_pool(void **p, struct ceph_pg_pool_info *pi)
428{
429 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
430 calc_pg_masks(pi);
431 *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
432 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
433}
434
435static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
436{
437 struct ceph_pg_pool_info *pi;
438 u32 num, len, pool;
439
440 ceph_decode_32_safe(p, end, num, bad);
441 dout(" %d pool names\n", num);
442 while (num--) {
443 ceph_decode_32_safe(p, end, pool, bad);
444 ceph_decode_32_safe(p, end, len, bad);
445 dout(" pool %d len %d\n", pool, len);
446 pi = __lookup_pg_pool(&map->pg_pools, pool);
447 if (pi) {
448 kfree(pi->name);
449 pi->name = kmalloc(len + 1, GFP_NOFS);
450 if (pi->name) {
451 memcpy(pi->name, *p, len);
452 pi->name[len] = '\0';
453 dout(" name is %s\n", pi->name);
454 }
455 }
456 *p += len;
457 }
458 return 0;
459
460bad:
461 return -EINVAL;
462}
463
464/*
465 * osd map
466 */
467void ceph_osdmap_destroy(struct ceph_osdmap *map)
468{
469 dout("osdmap_destroy %p\n", map);
470 if (map->crush)
471 crush_destroy(map->crush);
472 while (!RB_EMPTY_ROOT(&map->pg_temp)) {
473 struct ceph_pg_mapping *pg =
474 rb_entry(rb_first(&map->pg_temp),
475 struct ceph_pg_mapping, node);
476 rb_erase(&pg->node, &map->pg_temp);
477 kfree(pg);
478 }
479 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
480 struct ceph_pg_pool_info *pi =
481 rb_entry(rb_first(&map->pg_pools),
482 struct ceph_pg_pool_info, node);
483 __remove_pg_pool(&map->pg_pools, pi);
484 }
485 kfree(map->osd_state);
486 kfree(map->osd_weight);
487 kfree(map->osd_addr);
488 kfree(map);
489}
490
491/*
492 * adjust max osd value. reallocate arrays.
493 */
494static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
495{
496 u8 *state;
497 struct ceph_entity_addr *addr;
498 u32 *weight;
499
500 state = kcalloc(max, sizeof(*state), GFP_NOFS);
501 addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
502 weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
503 if (state == NULL || addr == NULL || weight == NULL) {
504 kfree(state);
505 kfree(addr);
506 kfree(weight);
507 return -ENOMEM;
508 }
509
510 /* copy old? */
511 if (map->osd_state) {
512 memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
513 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
514 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
515 kfree(map->osd_state);
516 kfree(map->osd_addr);
517 kfree(map->osd_weight);
518 }
519
520 map->osd_state = state;
521 map->osd_weight = weight;
522 map->osd_addr = addr;
523 map->max_osd = max;
524 return 0;
525}
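/*
 * Editor's sketch (not part of this file): osdmap_set_max_osd() above uses
 * an allocate-everything-then-commit pattern -- acquire all of the new
 * arrays first, free them all if any allocation fails, and only then copy
 * and swap, so the map is never left half-resized.  A two-array user-space
 * analogue:
 */
#include <stdlib.h>
#include <string.h>

struct vec { int *a; double *b; int n; };

/* grow both arrays to n entries, or change nothing and return -1 */
static int vec_set_size(struct vec *v, int n)
{
	int *a = calloc(n, sizeof(*a));
	double *b = calloc(n, sizeof(*b));

	if (!a || !b) {		/* all-or-nothing: drop both on any failure */
		free(a);
		free(b);
		return -1;
	}
	if (v->a) {		/* copy old contents, then free old arrays */
		memcpy(a, v->a, v->n * sizeof(*a));
		memcpy(b, v->b, v->n * sizeof(*b));
		free(v->a);
		free(v->b);
	}
	v->a = a;
	v->b = b;
	v->n = n;
	return 0;
}

int main(void)
{
	struct vec v = { NULL, NULL, 0 };
	return vec_set_size(&v, 8) == 0 ? 0 : 1;
}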
526
527/*
528 * decode a full map.
529 */
530struct ceph_osdmap *osdmap_decode(void **p, void *end)
531{
532 struct ceph_osdmap *map;
533 u16 version;
534 u32 len, max, i;
535 u8 ev;
536 int err = -EINVAL;
537 void *start = *p;
538 struct ceph_pg_pool_info *pi;
539
540 dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
541
542 map = kzalloc(sizeof(*map), GFP_NOFS);
543 if (map == NULL)
544 return ERR_PTR(-ENOMEM);
545 map->pg_temp = RB_ROOT;
546
547 ceph_decode_16_safe(p, end, version, bad);
548 if (version > CEPH_OSDMAP_VERSION) {
549 pr_warning("got unknown v %d > %d of osdmap\n", version,
550 CEPH_OSDMAP_VERSION);
551 goto bad;
552 }
553
554 ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
555 ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
556 map->epoch = ceph_decode_32(p);
557 ceph_decode_copy(p, &map->created, sizeof(map->created));
558 ceph_decode_copy(p, &map->modified, sizeof(map->modified));
559
560 ceph_decode_32_safe(p, end, max, bad);
561 while (max--) {
562 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
563 pi = kzalloc(sizeof(*pi), GFP_NOFS);
564 if (!pi)
565 goto bad;
566 pi->id = ceph_decode_32(p);
567 ev = ceph_decode_8(p); /* encoding version */
568 if (ev > CEPH_PG_POOL_VERSION) {
569 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
570 ev, CEPH_PG_POOL_VERSION);
571 goto bad;
572 }
573 __decode_pool(p, pi);
574 __insert_pg_pool(&map->pg_pools, pi);
575 }
576
577 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
578 goto bad;
579
580 ceph_decode_32_safe(p, end, map->pool_max, bad);
581
582 ceph_decode_32_safe(p, end, map->flags, bad);
583
584 max = ceph_decode_32(p);
585
586 /* (re)alloc osd arrays */
587 err = osdmap_set_max_osd(map, max);
588 if (err < 0)
589 goto bad;
590 dout("osdmap_decode max_osd = %d\n", map->max_osd);
591
592 /* osds */
593 err = -EINVAL;
594 ceph_decode_need(p, end, 3*sizeof(u32) +
595 map->max_osd*(1 + sizeof(*map->osd_weight) +
596 sizeof(*map->osd_addr)), bad);
597 *p += 4; /* skip length field (should match max) */
598 ceph_decode_copy(p, map->osd_state, map->max_osd);
599
600 *p += 4; /* skip length field (should match max) */
601 for (i = 0; i < map->max_osd; i++)
602 map->osd_weight[i] = ceph_decode_32(p);
603
604 *p += 4; /* skip length field (should match max) */
605 ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
606 for (i = 0; i < map->max_osd; i++)
607 ceph_decode_addr(&map->osd_addr[i]);
608
609 /* pg_temp */
610 ceph_decode_32_safe(p, end, len, bad);
611 for (i = 0; i < len; i++) {
612 int n, j;
613 struct ceph_pg pgid;
614 struct ceph_pg_mapping *pg;
615
616 ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
617 ceph_decode_copy(p, &pgid, sizeof(pgid));
618 n = ceph_decode_32(p);
619 ceph_decode_need(p, end, n * sizeof(u32), bad);
620 err = -ENOMEM;
621 pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
622 if (!pg)
623 goto bad;
624 pg->pgid = pgid;
625 pg->len = n;
626 for (j = 0; j < n; j++)
627 pg->osds[j] = ceph_decode_32(p);
628
629 err = __insert_pg_mapping(pg, &map->pg_temp);
630 if (err)
631 goto bad;
632 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, n);
633 }
634
635 /* crush */
636 ceph_decode_32_safe(p, end, len, bad);
637 dout("osdmap_decode crush len %d from off 0x%x\n", len,
638 (int)(*p - start));
639 ceph_decode_need(p, end, len, bad);
640 map->crush = crush_decode(*p, end);
641 *p += len;
642 if (IS_ERR(map->crush)) {
643 err = PTR_ERR(map->crush);
644 map->crush = NULL;
645 goto bad;
646 }
647
648 /* ignore the rest of the map */
649 *p = end;
650
651 dout("osdmap_decode done %p %p\n", *p, end);
652 return map;
653
654bad:
655 dout("osdmap_decode fail\n");
656 ceph_osdmap_destroy(map);
657 return ERR_PTR(err);
658}
659
660/*
661 * decode and apply an incremental map update.
662 */
663struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
664 struct ceph_osdmap *map,
665 struct ceph_messenger *msgr)
666{
667 struct crush_map *newcrush = NULL;
668 struct ceph_fsid fsid;
669 u32 epoch = 0;
670 struct ceph_timespec modified;
671 u32 len, pool;
672 __s32 new_pool_max, new_flags, max;
673 void *start = *p;
674 int err = -EINVAL;
675 u16 version;
676 struct rb_node *rbp;
677
678 ceph_decode_16_safe(p, end, version, bad);
679 if (version > CEPH_OSDMAP_INC_VERSION) {
680 pr_warning("got unknown v %d > %d of inc osdmap\n", version,
681 CEPH_OSDMAP_INC_VERSION);
682 goto bad;
683 }
684
685 ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
686 bad);
687 ceph_decode_copy(p, &fsid, sizeof(fsid));
688 epoch = ceph_decode_32(p);
689 BUG_ON(epoch != map->epoch+1);
690 ceph_decode_copy(p, &modified, sizeof(modified));
691 new_pool_max = ceph_decode_32(p);
692 new_flags = ceph_decode_32(p);
693
694 /* full map? */
695 ceph_decode_32_safe(p, end, len, bad);
696 if (len > 0) {
697 dout("apply_incremental full map len %d, %p to %p\n",
698 len, *p, end);
699 return osdmap_decode(p, min(*p+len, end));
700 }
701
702 /* new crush? */
703 ceph_decode_32_safe(p, end, len, bad);
704 if (len > 0) {
705 dout("apply_incremental new crush map len %d, %p to %p\n",
706 len, *p, end);
707 newcrush = crush_decode(*p, min(*p+len, end));
708 if (IS_ERR(newcrush))
709 return ERR_CAST(newcrush);
710 }
711
712 /* new flags? */
713 if (new_flags >= 0)
714 map->flags = new_flags;
715 if (new_pool_max >= 0)
716 map->pool_max = new_pool_max;
717
718 ceph_decode_need(p, end, 5*sizeof(u32), bad);
719
720 /* new max? */
721 max = ceph_decode_32(p);
722 if (max >= 0) {
723 err = osdmap_set_max_osd(map, max);
724 if (err < 0)
725 goto bad;
726 }
727
728 map->epoch++;
729 map->modified = modified;
730 if (newcrush) {
731 if (map->crush)
732 crush_destroy(map->crush);
733 map->crush = newcrush;
734 newcrush = NULL;
735 }
736
737 /* new_pool */
738 ceph_decode_32_safe(p, end, len, bad);
739 while (len--) {
740 __u8 ev;
741 struct ceph_pg_pool_info *pi;
742
743 ceph_decode_32_safe(p, end, pool, bad);
744 ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
745 ev = ceph_decode_8(p); /* encoding version */
746 if (ev > CEPH_PG_POOL_VERSION) {
747 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
748 ev, CEPH_PG_POOL_VERSION);
749 goto bad;
750 }
751 pi = __lookup_pg_pool(&map->pg_pools, pool);
752 if (!pi) {
753 pi = kzalloc(sizeof(*pi), GFP_NOFS);
754 if (!pi) {
755 err = -ENOMEM;
756 goto bad;
757 }
758 pi->id = pool;
759 __insert_pg_pool(&map->pg_pools, pi);
760 }
761 __decode_pool(p, pi);
762 }
763 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
764 goto bad;
765
766 /* old_pool */
767 ceph_decode_32_safe(p, end, len, bad);
768 while (len--) {
769 struct ceph_pg_pool_info *pi;
770
771 ceph_decode_32_safe(p, end, pool, bad);
772 pi = __lookup_pg_pool(&map->pg_pools, pool);
773 if (pi)
774 __remove_pg_pool(&map->pg_pools, pi);
775 }
776
777 /* new_up */
778 err = -EINVAL;
779 ceph_decode_32_safe(p, end, len, bad);
780 while (len--) {
781 u32 osd;
782 struct ceph_entity_addr addr;
783 ceph_decode_32_safe(p, end, osd, bad);
784 ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
785 ceph_decode_addr(&addr);
786 pr_info("osd%d up\n", osd);
787 BUG_ON(osd >= map->max_osd);
788 map->osd_state[osd] |= CEPH_OSD_UP;
789 map->osd_addr[osd] = addr;
790 }
791
792 /* new_down */
793 ceph_decode_32_safe(p, end, len, bad);
794 while (len--) {
795 u32 osd;
796 ceph_decode_32_safe(p, end, osd, bad);
797 (*p)++; /* clean flag */
798 pr_info("osd%d down\n", osd);
799 if (osd < map->max_osd)
800 map->osd_state[osd] &= ~CEPH_OSD_UP;
801 }
802
803 /* new_weight */
804 ceph_decode_32_safe(p, end, len, bad);
805 while (len--) {
806 u32 osd, off;
807 ceph_decode_need(p, end, sizeof(u32)*2, bad);
808 osd = ceph_decode_32(p);
809 off = ceph_decode_32(p);
810 pr_info("osd%d weight 0x%x %s\n", osd, off,
811 off == CEPH_OSD_IN ? "(in)" :
812 (off == CEPH_OSD_OUT ? "(out)" : ""));
813 if (osd < map->max_osd)
814 map->osd_weight[osd] = off;
815 }
816
817 /* new_pg_temp */
818 rbp = rb_first(&map->pg_temp);
819 ceph_decode_32_safe(p, end, len, bad);
820 while (len--) {
821 struct ceph_pg_mapping *pg;
822 int j;
823 struct ceph_pg pgid;
824 u32 pglen;
825 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
826 ceph_decode_copy(p, &pgid, sizeof(pgid));
827 pglen = ceph_decode_32(p);
828
829 /* remove any? */
830 while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
831 node)->pgid, pgid) <= 0) {
832 struct rb_node *cur = rbp;
833 rbp = rb_next(rbp);
834 dout(" removed pg_temp %llx\n",
835 *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
836 node)->pgid);
837 rb_erase(cur, &map->pg_temp);
838 }
839
840 if (pglen) {
841 /* insert */
842 ceph_decode_need(p, end, pglen*sizeof(u32), bad);
843 pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
844 if (!pg) {
845 err = -ENOMEM;
846 goto bad;
847 }
848 pg->pgid = pgid;
849 pg->len = pglen;
850 for (j = 0; j < pglen; j++)
851 pg->osds[j] = ceph_decode_32(p);
852 err = __insert_pg_mapping(pg, &map->pg_temp);
853 if (err)
854 goto bad;
855 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
856 pglen);
857 }
858 }
859 while (rbp) {
860 struct rb_node *cur = rbp;
861 rbp = rb_next(rbp);
862 dout(" removed pg_temp %llx\n",
863 *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
864 node)->pgid);
865 rb_erase(cur, &map->pg_temp);
866 }
867
868 /* ignore the rest */
869 *p = end;
870 return map;
871
872bad:
873 pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
874 epoch, (int)(*p - start), *p, start, end);
875 print_hex_dump(KERN_DEBUG, "osdmap: ",
876 DUMP_PREFIX_OFFSET, 16, 1,
877 start, end - start, true);
878 if (newcrush)
879 crush_destroy(newcrush);
880 return ERR_PTR(err);
881}
882
883
884
885
886/*
887 * calculate file layout from given offset, length.
888 * fill in correct oid, logical length, and object extent
889 * offset, length.
890 *
891 * for now, we write only a single su, until we can
892 * pass a stride back to the caller.
893 */
894void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
895 u64 off, u64 *plen,
896 u64 *ono,
897 u64 *oxoff, u64 *oxlen)
898{
899 u32 osize = le32_to_cpu(layout->fl_object_size);
900 u32 su = le32_to_cpu(layout->fl_stripe_unit);
901 u32 sc = le32_to_cpu(layout->fl_stripe_count);
902 u32 bl, stripeno, stripepos, objsetno;
903 u32 su_per_object;
904 u64 t, su_offset;
905
906 dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen,
907 osize, su);
908 su_per_object = osize / su;
909 dout("osize %u / su %u = su_per_object %u\n", osize, su,
910 su_per_object);
911
912 BUG_ON((su & ~PAGE_MASK) != 0);
913 /* bl = *off / su; */
914 t = off;
915 do_div(t, su);
916 bl = t;
917 dout("off %llu / su %u = bl %u\n", off, su, bl);
918
919 stripeno = bl / sc;
920 stripepos = bl % sc;
921 objsetno = stripeno / su_per_object;
922
923 *ono = objsetno * sc + stripepos;
924 dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
925
926 /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */
927 t = off;
928 su_offset = do_div(t, su);
929 *oxoff = su_offset + (stripeno % su_per_object) * su;
930
931 /*
932 * Calculate the length of the extent being written to the selected
933 * object. This is the minimum of the full length requested (plen) or
934 * the remainder of the current stripe being written to.
935 */
936 *oxlen = min_t(u64, *plen, su - su_offset);
937 *plen = *oxlen;
938
939 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
940}
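/*
 * Editor's worked example (not part of this file): the mapping above redone
 * in user space with small, hypothetical layout values -- stripe_unit 64K,
 * stripe_count 3, object_size 256K, so su_per_object = 4.  Plain division
 * stands in for the kernel's do_div():
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t su = 65536, sc = 3, osize = 262144;
	uint64_t su_per_object = osize / su;		/* 4 */
	uint64_t off = 300000, plen = 1000000;

	uint64_t bl = off / su;				/* 4th stripe unit */
	uint64_t stripeno = bl / sc;			/* 1 */
	uint64_t stripepos = bl % sc;			/* 1 */
	uint64_t objsetno = stripeno / su_per_object;	/* 0 */
	uint64_t ono = objsetno * sc + stripepos;	/* object number 1 */

	uint64_t su_offset = off % su;			/* 37856 */
	uint64_t oxoff = su_offset + (stripeno % su_per_object) * su;
	uint64_t oxlen = plen < su - su_offset ? plen : su - su_offset;

	/* prints: ono 1 extent 103392~27680 */
	printf("ono %llu extent %llu~%llu\n", (unsigned long long)ono,
	       (unsigned long long)oxoff, (unsigned long long)oxlen);
	return 0;
}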
941
942/*
943 * calculate an object layout (i.e. pgid) from an oid,
944 * file_layout, and osdmap
945 */
946int ceph_calc_object_layout(struct ceph_object_layout *ol,
947 const char *oid,
948 struct ceph_file_layout *fl,
949 struct ceph_osdmap *osdmap)
950{
951 unsigned num, num_mask;
952 struct ceph_pg pgid;
953 s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
954 int poolid = le32_to_cpu(fl->fl_pg_pool);
955 struct ceph_pg_pool_info *pool;
956 unsigned ps;
957
958 BUG_ON(!osdmap);
959
960 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
961 if (!pool)
962 return -EIO;
963 ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
964 if (preferred >= 0) {
965 ps += preferred;
966 num = le32_to_cpu(pool->v.lpg_num);
967 num_mask = pool->lpg_num_mask;
968 } else {
969 num = le32_to_cpu(pool->v.pg_num);
970 num_mask = pool->pg_num_mask;
971 }
972
973 pgid.ps = cpu_to_le16(ps);
974 pgid.preferred = cpu_to_le16(preferred);
975 pgid.pool = fl->fl_pg_pool;
976 if (preferred >= 0)
977 dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
978 (int)preferred);
979 else
980 dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
981
982 ol->ol_pgid = pgid;
983 ol->ol_stripe_unit = fl->fl_object_stripe_unit;
984 return 0;
985}
986
987/*
988 * Calculate raw osd vector for the given pgid. Return pointer to osd
989 * array, or NULL on failure.
990 */
991static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
992 int *osds, int *num)
993{
994 struct ceph_pg_mapping *pg;
995 struct ceph_pg_pool_info *pool;
996 int ruleno;
997 unsigned poolid, ps, pps;
998 int preferred;
999
1000 /* pg_temp? */
1001 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
1002 if (pg) {
1003 *num = pg->len;
1004 return pg->osds;
1005 }
1006
1007 /* crush */
1008 poolid = le32_to_cpu(pgid.pool);
1009 ps = le16_to_cpu(pgid.ps);
1010 preferred = (s16)le16_to_cpu(pgid.preferred);
1011
1012 /* don't forcefeed bad device ids to crush */
1013 if (preferred >= osdmap->max_osd ||
1014 preferred >= osdmap->crush->max_devices)
1015 preferred = -1;
1016
1017 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
1018 if (!pool)
1019 return NULL;
1020 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
1021 pool->v.type, pool->v.size);
1022 if (ruleno < 0) {
1023 pr_err("no crush rule pool %d type %d size %d\n",
1024 poolid, pool->v.type, pool->v.size);
1025 return NULL;
1026 }
1027
1028 if (preferred >= 0)
1029 pps = ceph_stable_mod(ps,
1030 le32_to_cpu(pool->v.lpgp_num),
1031 pool->lpgp_num_mask);
1032 else
1033 pps = ceph_stable_mod(ps,
1034 le32_to_cpu(pool->v.pgp_num),
1035 pool->pgp_num_mask);
1036 pps += poolid;
1037 *num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
1038 min_t(int, pool->v.size, *num),
1039 preferred, osdmap->osd_weight);
1040 return osds;
1041}
1042
1043/*
1044 * Return acting set for given pgid.
1045 */
1046int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1047 int *acting)
1048{
1049 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1050 int i, o, num = CEPH_PG_MAX_SIZE;
1051
1052 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1053 if (!osds)
1054 return -1;
1055
1056 /* primary is first up osd */
1057 o = 0;
1058 for (i = 0; i < num; i++)
1059 if (ceph_osd_is_up(osdmap, osds[i]))
1060 acting[o++] = osds[i];
1061 return o;
1062}
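/*
 * Editor's sketch (not part of this file): the acting set above is simply
 * the up members of the raw CRUSH result, in order, with the primary being
 * the first of those.  A user-space illustration with a made-up up-state
 * table:
 */
#include <stdio.h>

int main(void)
{
	int raw[] = { 4, 2, 7 };			/* hypothetical CRUSH output */
	int up[8] = { 0, 0, 1, 0, 0, 0, 0, 1 };	/* only osd2 and osd7 are up */
	int acting[3], o = 0, i;

	for (i = 0; i < 3; i++)
		if (up[raw[i]])
			acting[o++] = raw[i];

	/* prints: acting 2 7 primary 2 */
	printf("acting");
	for (i = 0; i < o; i++)
		printf(" %d", acting[i]);
	printf(" primary %d\n", o ? acting[0] : -1);
	return 0;
}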
1063
1064/*
1065 * Return primary osd for given pgid, or -1 if none.
1066 */
1067int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1068{
1069 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1070 int i, num = CEPH_PG_MAX_SIZE;
1071
1072 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1073 if (!osds)
1074 return -1;
1075
1076 /* primary is first up osd */
1077 for (i = 0; i < num; i++)
1078 if (ceph_osd_is_up(osdmap, osds[i]))
1079 return osds[i];
1080 return -1;
1081}
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
new file mode 100644
index 000000000000..970b547e510d
--- /dev/null
+++ b/fs/ceph/osdmap.h
@@ -0,0 +1,128 @@
1#ifndef _FS_CEPH_OSDMAP_H
2#define _FS_CEPH_OSDMAP_H
3
4#include <linux/rbtree.h>
5#include "types.h"
6#include "ceph_fs.h"
7#include "crush/crush.h"
8
9/*
10 * The osd map describes the current membership of the osd cluster and
11 * specifies the mapping of objects to placement groups and placement
12 * groups to (sets of) osds. That is, it completely specifies the
13 * (desired) distribution of all data objects in the system at some
14 * point in time.
15 *
16 * Each map version is identified by an epoch, which increases monotonically.
17 *
18 * The map can be updated either via an incremental map (diff) describing
19 * the change between two successive epochs, or as a fully encoded map.
20 */
21struct ceph_pg_pool_info {
22 struct rb_node node;
23 int id;
24 struct ceph_pg_pool v;
25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
26 char *name;
27};
28
29struct ceph_pg_mapping {
30 struct rb_node node;
31 struct ceph_pg pgid;
32 int len;
33 int osds[];
34};
35
36struct ceph_osdmap {
37 struct ceph_fsid fsid;
38 u32 epoch;
39 u32 mkfs_epoch;
40 struct ceph_timespec created, modified;
41
42 u32 flags; /* CEPH_OSDMAP_* */
43
44 u32 max_osd; /* size of osd_state, _offload, _addr arrays */
45 u8 *osd_state; /* CEPH_OSD_* */
46 u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
47 struct ceph_entity_addr *osd_addr;
48
49 struct rb_root pg_temp;
50 struct rb_root pg_pools;
51 u32 pool_max;
52
53 /* the CRUSH map specifies the mapping of placement groups to
54 * the list of osds that store+replicate them. */
55 struct crush_map *crush;
56};
57
58/*
59 * file layout helpers
60 */
61#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
62#define ceph_file_layout_stripe_count(l) \
63 ((__s32)le32_to_cpu((l).fl_stripe_count))
64#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
65#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
66#define ceph_file_layout_object_su(l) \
67 ((__s32)le32_to_cpu((l).fl_object_stripe_unit))
68#define ceph_file_layout_pg_preferred(l) \
69 ((__s32)le32_to_cpu((l).fl_pg_preferred))
70#define ceph_file_layout_pg_pool(l) \
71 ((__s32)le32_to_cpu((l).fl_pg_pool))
72
73static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
74{
75 return le32_to_cpu(l->fl_stripe_unit) *
76 le32_to_cpu(l->fl_stripe_count);
77}
78
79/* "period" == bytes before i start on a new set of objects */
80static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
81{
82 return le32_to_cpu(l->fl_object_size) *
83 le32_to_cpu(l->fl_stripe_count);
84}
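/*
 * Editor's worked example (hypothetical numbers, not part of this file):
 * with stripe_unit 64K, stripe_count 4 and object_size 4M, stripe_width =
 * 64K * 4 = 256K and period = 4M * 4 = 16M; after 16M of file data the
 * striping starts over on a fresh set of 4 objects.
 */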
85
86
87static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
88{
89 return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
90}
91
92static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
93{
94 return map && (map->flags & flag);
95}
96
97extern char *ceph_osdmap_state_str(char *str, int len, int state);
98
99static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
100 int osd)
101{
102 if (osd >= map->max_osd)
103 return NULL;
104 return &map->osd_addr[osd];
105}
106
107extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
108extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
109 struct ceph_osdmap *map,
110 struct ceph_messenger *msgr);
111extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
112
113/* calculate mapping of a file extent to an object */
114extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
115 u64 off, u64 *plen,
116 u64 *bno, u64 *oxoff, u64 *oxlen);
117
118/* calculate mapping of object to a placement group */
119extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
120 const char *oid,
121 struct ceph_file_layout *fl,
122 struct ceph_osdmap *osdmap);
123extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
124 int *acting);
125extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
126 struct ceph_pg pgid);
127
128#endif
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
new file mode 100644
index 000000000000..5f8dbf7c745a
--- /dev/null
+++ b/fs/ceph/pagelist.c
@@ -0,0 +1,55 @@
1
2#include <linux/gfp.h>
3#include <linux/pagemap.h>
4#include <linux/highmem.h>
5
6#include "pagelist.h"
7
8int ceph_pagelist_release(struct ceph_pagelist *pl)
9{
10 if (pl->mapped_tail)
11 kunmap(pl->mapped_tail);
12 while (!list_empty(&pl->head)) {
13 struct page *page = list_first_entry(&pl->head, struct page,
14 lru);
15 list_del(&page->lru);
16 __free_page(page);
17 }
18 return 0;
19}
20
21static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
22{
23 struct page *page = alloc_page(GFP_NOFS);
24 if (!page)
25 return -ENOMEM;
26 pl->room += PAGE_SIZE;
27 list_add_tail(&page->lru, &pl->head);
28 if (pl->mapped_tail)
29 kunmap(pl->mapped_tail);
30 pl->mapped_tail = kmap(page);
31 return 0;
32}
33
34int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
35{
36 while (pl->room < len) {
37 size_t bit = pl->room;
38 int ret;
39
40 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
41 buf, bit);
42 pl->length += bit;
43 pl->room -= bit;
44 buf += bit;
45 len -= bit;
46 ret = ceph_pagelist_addpage(pl);
47 if (ret)
48 return ret;
49 }
50
51 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
52 pl->length += len;
53 pl->room -= len;
54 return 0;
55}
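/*
 * Editor's sketch (not part of this file): intended use of the pagelist API
 * above -- init, append little-endian encoded data via the helpers declared
 * in pagelist.h, hand pl.head to an outgoing message, then release.  A
 * kernel-context sketch, not buildable standalone:
 */
static int example_build_payload(void)
{
	struct ceph_pagelist pl;
	int err;

	ceph_pagelist_init(&pl);
	err = ceph_pagelist_encode_32(&pl, 42);		/* __le32 on the wire */
	if (!err)
		err = ceph_pagelist_encode_string(&pl, "hello", 5);
	/* ... attach the pages on pl.head to a ceph_msg here ... */
	ceph_pagelist_release(&pl);
	return err;
}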
diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h
new file mode 100644
index 000000000000..e8a4187e1087
--- /dev/null
+++ b/fs/ceph/pagelist.h
@@ -0,0 +1,54 @@
1#ifndef __FS_CEPH_PAGELIST_H
2#define __FS_CEPH_PAGELIST_H
3
4#include <linux/list.h>
5
6struct ceph_pagelist {
7 struct list_head head;
8 void *mapped_tail;
9 size_t length;
10 size_t room;
11};
12
13static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
14{
15 INIT_LIST_HEAD(&pl->head);
16 pl->mapped_tail = NULL;
17 pl->length = 0;
18 pl->room = 0;
19}
20extern int ceph_pagelist_release(struct ceph_pagelist *pl);
21
22extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l);
23
24static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
25{
26 __le64 ev = cpu_to_le64(v);
27 return ceph_pagelist_append(pl, &ev, sizeof(ev));
28}
29static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
30{
31 __le32 ev = cpu_to_le32(v);
32 return ceph_pagelist_append(pl, &ev, sizeof(ev));
33}
34static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
35{
36 __le16 ev = cpu_to_le16(v);
37 return ceph_pagelist_append(pl, &ev, sizeof(ev));
38}
39static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
40{
41 return ceph_pagelist_append(pl, &v, 1);
42}
43static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
44 char *s, size_t len)
45{
46 int ret = ceph_pagelist_encode_32(pl, len);
47 if (ret)
48 return ret;
49 if (len)
50 return ceph_pagelist_append(pl, s, len);
51 return 0;
52}
53
54#endif
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
new file mode 100644
index 000000000000..fd56451a871f
--- /dev/null
+++ b/fs/ceph/rados.h
@@ -0,0 +1,377 @@
1#ifndef __RADOS_H
2#define __RADOS_H
3
4/*
5 * Data types for the Ceph distributed object storage layer RADOS
6 * (Reliable Autonomic Distributed Object Store).
7 */
8
9#include "msgr.h"
10
11/*
12 * osdmap encoding versions
13 */
14#define CEPH_OSDMAP_INC_VERSION 5
15#define CEPH_OSDMAP_INC_VERSION_EXT 5
16#define CEPH_OSDMAP_VERSION 5
17#define CEPH_OSDMAP_VERSION_EXT 5
18
19/*
20 * fs id
21 */
22struct ceph_fsid {
23 unsigned char fsid[16];
24};
25
26static inline int ceph_fsid_compare(const struct ceph_fsid *a,
27 const struct ceph_fsid *b)
28{
29 return memcmp(a, b, sizeof(*a));
30}
31
32/*
33 * ino, object, etc.
34 */
35typedef __le64 ceph_snapid_t;
36#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
37#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
38#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
39
40struct ceph_timespec {
41 __le32 tv_sec;
42 __le32 tv_nsec;
43} __attribute__ ((packed));
44
45
46/*
47 * object layout - how objects are mapped into PGs
48 */
49#define CEPH_OBJECT_LAYOUT_HASH 1
50#define CEPH_OBJECT_LAYOUT_LINEAR 2
51#define CEPH_OBJECT_LAYOUT_HASHINO 3
52
53/*
54 * pg layout -- how PGs are mapped onto (sets of) OSDs
55 */
56#define CEPH_PG_LAYOUT_CRUSH 0
57#define CEPH_PG_LAYOUT_HASH 1
58#define CEPH_PG_LAYOUT_LINEAR 2
59#define CEPH_PG_LAYOUT_HYBRID 3
60
61#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */
62
63/*
64 * placement group.
65 * we encode this into one __le64.
66 */
67struct ceph_pg {
68 __le16 preferred; /* preferred primary osd */
69 __le16 ps; /* placement seed */
70 __le32 pool; /* object pool */
71} __attribute__ ((packed));
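/*
 * Editor's sketch (not part of this file): the packed struct above is
 * exactly 8 bytes (16 + 16 + 32 bits), which is why code elsewhere prints
 * a pgid as *(u64 *)&pgid.  A user-space size check (memcpy avoids the
 * aliasing cast):
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct pg { uint16_t preferred, ps; uint32_t pool; } __attribute__((packed));

int main(void)
{
	struct pg pgid = { .preferred = 0xffff, .ps = 1, .pool = 2 };
	uint64_t raw;

	_Static_assert(sizeof(struct pg) == 8, "pg must pack into one u64");
	memcpy(&raw, &pgid, sizeof(raw));	/* byte order is host-dependent */
	printf("pgid raw 0x%llx\n", (unsigned long long)raw);
	return 0;
}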
72
73/*
74 * pg_pool is a set of pgs storing a pool of objects
75 *
76 * pg_num -- base number of pseudorandomly placed pgs
77 *
78 * pgp_num -- effective number when calculating pg placement. this
79 * is used for pg_num increases. new pgs result in data being "split"
80 * into new pgs. for this to proceed smoothly, new pgs are initially
81 * colocated with their parents; that is, pgp_num doesn't increase
82 * until the new pgs have successfully split. only _then_ are the new
83 * pgs placed independently.
84 *
85 * lpg_num -- localized pg count (per device). replicas are randomly
86 * selected.
87 *
88 * lpgp_num -- as above.
89 */
90#define CEPH_PG_TYPE_REP 1
91#define CEPH_PG_TYPE_RAID4 2
92#define CEPH_PG_POOL_VERSION 2
93struct ceph_pg_pool {
94 __u8 type; /* CEPH_PG_TYPE_* */
95 __u8 size; /* number of osds in each pg */
96 __u8 crush_ruleset; /* crush placement rule */
97 __u8 object_hash; /* hash mapping object name to ps */
98 __le32 pg_num, pgp_num; /* number of pg's */
99 __le32 lpg_num, lpgp_num; /* number of localized pg's */
100 __le32 last_change; /* most recent epoch changed */
101 __le64 snap_seq; /* seq for per-pool snapshot */
102 __le32 snap_epoch; /* epoch of last snap */
103 __le32 num_snaps;
104 __le32 num_removed_snap_intervals;
105 __le64 uid;
106} __attribute__ ((packed));
107
108/*
109 * stable_mod func is used to control number of placement groups.
110 * similar to straight-up modulo, but produces a stable mapping as b
111 * increases over time. b is the number of bins, and bmask is the
112 * containing power of 2 minus 1.
113 *
114 * b <= bmask and bmask=(2**n)-1
115 * e.g., b=12 -> bmask=15, b=123 -> bmask=127
116 */
117static inline int ceph_stable_mod(int x, int b, int bmask)
118{
119 if ((x & bmask) < b)
120 return x & bmask;
121 else
122 return x & (bmask >> 1);
123}
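/*
 * Editor's sketch (not part of this file): ceph_stable_mod() above in user
 * space.  With b=12 (bmask=15), x=13 exceeds b and folds into the lower
 * half: 13 & 7 = 5.  Growing b to 14 moves only x=13; every other mapping
 * is left stable:
 */
#include <stdio.h>

static int stable_mod(int x, int b, int bmask)
{
	return (x & bmask) < b ? x & bmask : x & (bmask >> 1);
}

int main(void)
{
	/* prints: b=12: 5 5   b=14: 5 13 */
	printf("b=12: %d %d   b=14: %d %d\n",
	       stable_mod(5, 12, 15), stable_mod(13, 12, 15),
	       stable_mod(5, 14, 15), stable_mod(13, 14, 15));
	return 0;
}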
124
125/*
126 * object layout - how a given object should be stored.
127 */
128struct ceph_object_layout {
129 struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */
130 __le32 ol_stripe_unit; /* for per-object parity, if any */
131} __attribute__ ((packed));
132
133/*
134 * compound epoch+version, used by storage layer to serialize mutations
135 */
136struct ceph_eversion {
137 __le32 epoch;
138 __le64 version;
139} __attribute__ ((packed));
140
141/*
142 * osd map bits
143 */
144
145/* status bits */
146#define CEPH_OSD_EXISTS 1
147#define CEPH_OSD_UP 2
148
149/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
150#define CEPH_OSD_IN 0x10000
151#define CEPH_OSD_OUT 0
152
153
154/*
155 * osd map flag bits
156 */
157#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
158#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
159#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
160#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
161#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
162
163/*
164 * osd ops
165 */
166#define CEPH_OSD_OP_MODE 0xf000
167#define CEPH_OSD_OP_MODE_RD 0x1000
168#define CEPH_OSD_OP_MODE_WR 0x2000
169#define CEPH_OSD_OP_MODE_RMW 0x3000
170#define CEPH_OSD_OP_MODE_SUB 0x4000
171
172#define CEPH_OSD_OP_TYPE 0x0f00
173#define CEPH_OSD_OP_TYPE_LOCK 0x0100
174#define CEPH_OSD_OP_TYPE_DATA 0x0200
175#define CEPH_OSD_OP_TYPE_ATTR 0x0300
176#define CEPH_OSD_OP_TYPE_EXEC 0x0400
177#define CEPH_OSD_OP_TYPE_PG 0x0500
178
179enum {
180 /** data **/
181 /* read */
182 CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
183 CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
184
185 /* fancy read */
186 CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
187
188 /* write */
189 CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
190 CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
191 CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
192 CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
193 CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
194
195 /* fancy write */
196 CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
197 CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
198 CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
199 CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
200
201 CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
202 CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
203 CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
204
205 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
206
207 /** attrs **/
208 /* read */
209 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
210 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
211
212 /* write */
213 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
214 CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
215 CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
216 CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
217
218 /** subop **/
219 CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
220 CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
221 CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
222 CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
223 CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
224
225 /** lock **/
226 CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
227 CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
228 CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
229 CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
230 CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
231 CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
232
233 /** exec **/
234 CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
235
236 /** pg **/
237 CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
238};
239
240static inline int ceph_osd_op_type_lock(int op)
241{
242 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
243}
244static inline int ceph_osd_op_type_data(int op)
245{
246 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
247}
248static inline int ceph_osd_op_type_attr(int op)
249{
250 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
251}
252static inline int ceph_osd_op_type_exec(int op)
253{
254 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
255}
256static inline int ceph_osd_op_type_pg(int op)
257{
258 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
259}
260
261static inline int ceph_osd_op_mode_subop(int op)
262{
263 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
264}
265static inline int ceph_osd_op_mode_read(int op)
266{
267 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
268}
269static inline int ceph_osd_op_mode_modify(int op)
270{
271 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
272}
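/*
 * Editor's sketch (not part of this file): each op word above packs a mode
 * nibble (0xf000), a type nibble (0x0f00) and a small op index, which is
 * what the inline predicates test.  Decoding CEPH_OSD_OP_WRITE
 * (0x2000 | 0x0200 | 1 = 0x2201) in user space:
 */
#include <stdio.h>

int main(void)
{
	int op = 0x2000 | 0x0200 | 1;	/* CEPH_OSD_OP_WRITE */

	/* prints: op 0x2201 mode 0x2000 (WR) type 0x200 (DATA) */
	printf("op 0x%x mode 0x%x (%s) type 0x%x (%s)\n", op,
	       op & 0xf000, (op & 0xf000) == 0x2000 ? "WR" : "?",
	       op & 0x0f00, (op & 0x0f00) == 0x0200 ? "DATA" : "?");
	return 0;
}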
273
274#define CEPH_OSD_TMAP_HDR 'h'
275#define CEPH_OSD_TMAP_SET 's'
276#define CEPH_OSD_TMAP_RM 'r'
277
278extern const char *ceph_osd_op_name(int op);
279
280
281/*
282 * osd op flags
283 *
284 * An op may be READ, WRITE, or READ|WRITE.
285 */
286enum {
287 CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */
288 CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */
289 CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */
290 CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */
291 CEPH_OSD_FLAG_READ = 16, /* op may read */
292 CEPH_OSD_FLAG_WRITE = 32, /* op may write */
293 CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */
294 CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */
295 CEPH_OSD_FLAG_BALANCE_READS = 256,
296 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
297 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */
298 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */
299};
300
301enum {
302 CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
303};
304
305#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc */
306#define EBLACKLISTED ESHUTDOWN /* blacklisted */
307
308/*
309 * an individual object operation. each may be accompanied by some data
310 * payload
311 */
312struct ceph_osd_op {
313 __le16 op; /* CEPH_OSD_OP_* */
314 __le32 flags; /* CEPH_OSD_FLAG_* */
315 union {
316 struct {
317 __le64 offset, length;
318 __le64 truncate_size;
319 __le32 truncate_seq;
320 } __attribute__ ((packed)) extent;
321 struct {
322 __le32 name_len;
323 __le32 value_len;
324 } __attribute__ ((packed)) xattr;
325 struct {
326 __u8 class_len;
327 __u8 method_len;
328 __u8 argc;
329 __le32 indata_len;
330 } __attribute__ ((packed)) cls;
331 struct {
332 __le64 cookie, count;
333 } __attribute__ ((packed)) pgls;
334 };
335 __le32 payload_len;
336} __attribute__ ((packed));
337
338/*
339 * osd request message header. each request may include multiple
340 * ceph_osd_op object operations.
341 */
342struct ceph_osd_request_head {
343 __le32 client_inc; /* client incarnation */
344 struct ceph_object_layout layout; /* pgid */
345 __le32 osdmap_epoch; /* client's osdmap epoch */
346
347 __le32 flags;
348
349 struct ceph_timespec mtime; /* for mutations only */
350 struct ceph_eversion reassert_version; /* if we are replaying op */
351
352 __le32 object_len; /* length of object name */
353
354 __le64 snapid; /* snapid to read */
355 __le64 snap_seq; /* writer's snap context */
356 __le32 num_snaps;
357
358 __le16 num_ops;
359 struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */
360} __attribute__ ((packed));
361
362struct ceph_osd_reply_head {
363 __le32 client_inc; /* client incarnation */
364 __le32 flags;
365 struct ceph_object_layout layout;
366 __le32 osdmap_epoch;
367 struct ceph_eversion reassert_version; /* for replaying uncommitted */
368
369 __le32 result; /* result code */
370
371 __le32 object_len; /* length of object name */
372 __le32 num_ops;
373 struct ceph_osd_op ops[0]; /* ops[], object */
374} __attribute__ ((packed));
375
376
377#endif
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
new file mode 100644
index 000000000000..d5114db70453
--- /dev/null
+++ b/fs/ceph/snap.c
@@ -0,0 +1,911 @@
1#include "ceph_debug.h"
2
3#include <linux/sort.h>
4#include <linux/slab.h>
5
6#include "super.h"
7#include "decode.h"
8
9/*
10 * Snapshots in ceph are driven in large part by cooperation from the
11 * client. In contrast to local file systems or file servers that
12 * implement snapshots at a single point in the system, ceph's
13 * distributed access to storage requires clients to help decide
14 * whether a write logically occurs before or after a recently created
15 * snapshot.
16 *
17 * This provides a perfect instantaneous client-wide snapshot. Between
18 * clients, however, snapshots may appear to be applied at slightly
19 * different points in time, depending on delays in delivering the
20 * snapshot notification.
21 *
22 * Snapshots are _not_ file system-wide. Instead, each snapshot
23 * applies to the subdirectory nested beneath some directory. This
24 * effectively divides the hierarchy into multiple "realms," where all
25 * of the files contained by each realm share the same set of
26 * snapshots. An individual realm's snap set contains snapshots
27 * explicitly created on that realm, as well as any snaps in its
28 * parent's snap set _after_ the point at which the parent became its
29 * parent (due to, say, a rename). Similarly, snaps from prior parents
30 * are included for the time intervals during which they were the parent.
31 *
32 * The client is spared most of this detail, fortunately... it need only
33 * maintain a hierarchy of realms reflecting the current parent/child
34 * realm relationship, and for each realm has an explicit list of snaps
35 * inherited from prior parents.
36 *
37 * A snap_realm struct is maintained for realms containing every inode
38 * with an open cap in the system. (The needed snap realm information is
39 * provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq'
40 * version number is used to ensure that as realm parameters change (new
41 * snapshot, new parent, etc.) the client's realm hierarchy is updated.
42 *
43 * The realm hierarchy drives the generation of a 'snap context' for each
44 * realm, which simply lists the resulting set of snaps for the realm. This
45 * is attached to any writes sent to OSDs.
46 */
47/*
48 * Unfortunately error handling is a bit mixed here. If we get a snap
49 * update, but don't have enough memory to update our realm hierarchy,
50 * it's not clear what we can do about it (besides complaining to the
51 * console).
52 */
53
54
55/*
56 * increase ref count for the realm
57 *
58 * caller must hold snap_rwsem for write.
59 */
60void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
61 struct ceph_snap_realm *realm)
62{
63 dout("get_realm %p %d -> %d\n", realm,
64 atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
65 /*
66 * since we _only_ increment realm refs or empty the empty
67 * list with snap_rwsem held, adjusting the empty list here is
68 * safe. we do need to protect against concurrent empty list
69 * additions, however.
70 */
71 if (atomic_read(&realm->nref) == 0) {
72 spin_lock(&mdsc->snap_empty_lock);
73 list_del_init(&realm->empty_item);
74 spin_unlock(&mdsc->snap_empty_lock);
75 }
76
77 atomic_inc(&realm->nref);
78}
79
80static void __insert_snap_realm(struct rb_root *root,
81 struct ceph_snap_realm *new)
82{
83 struct rb_node **p = &root->rb_node;
84 struct rb_node *parent = NULL;
85 struct ceph_snap_realm *r = NULL;
86
87 while (*p) {
88 parent = *p;
89 r = rb_entry(parent, struct ceph_snap_realm, node);
90 if (new->ino < r->ino)
91 p = &(*p)->rb_left;
92 else if (new->ino > r->ino)
93 p = &(*p)->rb_right;
94 else
95 BUG();
96 }
97
98 rb_link_node(&new->node, parent, p);
99 rb_insert_color(&new->node, root);
100}
101
102/*
103 * create and get the realm rooted at @ino and bump its ref count.
104 *
105 * caller must hold snap_rwsem for write.
106 */
107static struct ceph_snap_realm *ceph_create_snap_realm(
108 struct ceph_mds_client *mdsc,
109 u64 ino)
110{
111 struct ceph_snap_realm *realm;
112
113 realm = kzalloc(sizeof(*realm), GFP_NOFS);
114 if (!realm)
115 return ERR_PTR(-ENOMEM);
116
117 atomic_set(&realm->nref, 0); /* tree does not take a ref */
118 realm->ino = ino;
119 INIT_LIST_HEAD(&realm->children);
120 INIT_LIST_HEAD(&realm->child_item);
121 INIT_LIST_HEAD(&realm->empty_item);
122 INIT_LIST_HEAD(&realm->inodes_with_caps);
123 spin_lock_init(&realm->inodes_with_caps_lock);
124 __insert_snap_realm(&mdsc->snap_realms, realm);
125 dout("create_snap_realm %llx %p\n", realm->ino, realm);
126 return realm;
127}
128
129/*
130 * lookup the realm rooted at @ino.
131 *
132 * caller must hold snap_rwsem for write.
133 */
134struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
135 u64 ino)
136{
137 struct rb_node *n = mdsc->snap_realms.rb_node;
138 struct ceph_snap_realm *r;
139
140 while (n) {
141 r = rb_entry(n, struct ceph_snap_realm, node);
142 if (ino < r->ino)
143 n = n->rb_left;
144 else if (ino > r->ino)
145 n = n->rb_right;
146 else {
147 dout("lookup_snap_realm %llx %p\n", r->ino, r);
148 return r;
149 }
150 }
151 return NULL;
152}
153
154static void __put_snap_realm(struct ceph_mds_client *mdsc,
155 struct ceph_snap_realm *realm);
156
157/*
158 * called with snap_rwsem (write)
159 */
160static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
161 struct ceph_snap_realm *realm)
162{
163 dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
164
165 rb_erase(&realm->node, &mdsc->snap_realms);
166
167 if (realm->parent) {
168 list_del_init(&realm->child_item);
169 __put_snap_realm(mdsc, realm->parent);
170 }
171
172 kfree(realm->prior_parent_snaps);
173 kfree(realm->snaps);
174 ceph_put_snap_context(realm->cached_context);
175 kfree(realm);
176}
177
178/*
179 * caller holds snap_rwsem (write)
180 */
181static void __put_snap_realm(struct ceph_mds_client *mdsc,
182 struct ceph_snap_realm *realm)
183{
184 dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
185 atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
186 if (atomic_dec_and_test(&realm->nref))
187 __destroy_snap_realm(mdsc, realm);
188}
189
190/*
191 * caller needn't hold any locks
192 */
193void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
194 struct ceph_snap_realm *realm)
195{
196 dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
197 atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
198 if (!atomic_dec_and_test(&realm->nref))
199 return;
200
201 if (down_write_trylock(&mdsc->snap_rwsem)) {
202 __destroy_snap_realm(mdsc, realm);
203 up_write(&mdsc->snap_rwsem);
204 } else {
205 spin_lock(&mdsc->snap_empty_lock);
206 list_add(&mdsc->snap_empty, &realm->empty_item);
207 spin_unlock(&mdsc->snap_empty_lock);
208 }
209}
210
211/*
212 * Clean up any realms whose ref counts have dropped to zero. Note
213 * that this does not include realms that were created but not yet
214 * used.
215 *
216 * Called under snap_rwsem (write)
217 */
218static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
219{
220 struct ceph_snap_realm *realm;
221
222 spin_lock(&mdsc->snap_empty_lock);
223 while (!list_empty(&mdsc->snap_empty)) {
224 realm = list_first_entry(&mdsc->snap_empty,
225 struct ceph_snap_realm, empty_item);
226 list_del(&realm->empty_item);
227 spin_unlock(&mdsc->snap_empty_lock);
228 __destroy_snap_realm(mdsc, realm);
229 spin_lock(&mdsc->snap_empty_lock);
230 }
231 spin_unlock(&mdsc->snap_empty_lock);
232}
233
234void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
235{
236 down_write(&mdsc->snap_rwsem);
237 __cleanup_empty_realms(mdsc);
238 up_write(&mdsc->snap_rwsem);
239}
240
241/*
242 * adjust the parent realm of a given @realm. adjust child list, and parent
243 * pointers, and ref counts appropriately.
244 *
245 * return 1 if parent was changed, 0 if unchanged, <0 on error.
246 *
247 * caller must hold snap_rwsem for write.
248 */
249static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
250 struct ceph_snap_realm *realm,
251 u64 parentino)
252{
253 struct ceph_snap_realm *parent;
254
255 if (realm->parent_ino == parentino)
256 return 0;
257
258 parent = ceph_lookup_snap_realm(mdsc, parentino);
259 if (!parent) {
260 parent = ceph_create_snap_realm(mdsc, parentino);
261 if (IS_ERR(parent))
262 return PTR_ERR(parent);
263 }
264 dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
265 realm->ino, realm, realm->parent_ino, realm->parent,
266 parentino, parent);
267 if (realm->parent) {
268 list_del_init(&realm->child_item);
269 ceph_put_snap_realm(mdsc, realm->parent);
270 }
271 realm->parent_ino = parentino;
272 realm->parent = parent;
273 ceph_get_snap_realm(mdsc, parent);
274 list_add(&realm->child_item, &parent->children);
275 return 1;
276}
277
278
279static int cmpu64_rev(const void *a, const void *b)
280{
281 if (*(u64 *)a < *(u64 *)b)
282 return 1;
283 if (*(u64 *)a > *(u64 *)b)
284 return -1;
285 return 0;
286}
287
288/*
289 * build the snap context for a given realm.
290 */
291static int build_snap_context(struct ceph_snap_realm *realm)
292{
293 struct ceph_snap_realm *parent = realm->parent;
294 struct ceph_snap_context *snapc;
295 int err = 0;
296 int i;
297 int num = realm->num_prior_parent_snaps + realm->num_snaps;
298
299 /*
300 * build parent context, if it hasn't been built.
301 * conservatively estimate that all parent snaps might be
302 * included by us.
303 */
304 if (parent) {
305 if (!parent->cached_context) {
306 err = build_snap_context(parent);
307 if (err)
308 goto fail;
309 }
310 num += parent->cached_context->num_snaps;
311 }
312
313 /* do i actually need to update? not if my context seq
314 matches realm seq, and my parents' does too. (this works
315 because rebuild_snap_realms() works _downward_ in the
316 hierarchy after each update.) */
317 if (realm->cached_context &&
318 realm->cached_context->seq == realm->seq &&
319 (!parent ||
320 realm->cached_context->seq >= parent->cached_context->seq)) {
321 dout("build_snap_context %llx %p: %p seq %lld (%d snaps)"
322 " (unchanged)\n",
323 realm->ino, realm, realm->cached_context,
324 realm->cached_context->seq,
325 realm->cached_context->num_snaps);
326 return 0;
327 }
328
329 /* alloc new snap context */
330 err = -ENOMEM;
331 if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc))
332 goto fail;
333 snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS);
334 if (!snapc)
335 goto fail;
336 atomic_set(&snapc->nref, 1);
337
338 /* build (reverse sorted) snap vector */
339 num = 0;
340 snapc->seq = realm->seq;
341 if (parent) {
342 /* include any of parent's snaps occurring _after_ my
343 parent became my parent */
344 for (i = 0; i < parent->cached_context->num_snaps; i++)
345 if (parent->cached_context->snaps[i] >=
346 realm->parent_since)
347 snapc->snaps[num++] =
348 parent->cached_context->snaps[i];
349 if (parent->cached_context->seq > snapc->seq)
350 snapc->seq = parent->cached_context->seq;
351 }
352 memcpy(snapc->snaps + num, realm->snaps,
353 sizeof(u64)*realm->num_snaps);
354 num += realm->num_snaps;
355 memcpy(snapc->snaps + num, realm->prior_parent_snaps,
356 sizeof(u64)*realm->num_prior_parent_snaps);
357 num += realm->num_prior_parent_snaps;
358
359 sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
360 snapc->num_snaps = num;
361 dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n",
362 realm->ino, realm, snapc, snapc->seq, snapc->num_snaps);
363
364 if (realm->cached_context)
365 ceph_put_snap_context(realm->cached_context);
366 realm->cached_context = snapc;
367 return 0;
368
369fail:
370 /*
371 * if we fail, clear old (incorrect) cached_context... hopefully
372 * we'll have better luck building it later
373 */
374 if (realm->cached_context) {
375 ceph_put_snap_context(realm->cached_context);
376 realm->cached_context = NULL;
377 }
378 pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
379 realm, err);
380 return err;
381}
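/*
 * Editor's worked example (hypothetical snapids, not part of this file):
 * if the parent's snaps are [8,5,2] with parent_since=4, only 8 and 5 are
 * inherited; with own snaps [6] and prior-parent snaps [3], the reverse
 * sort yields the snap context [8,6,5,3].
 */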
382
383/*
384 * rebuild snap context for the given realm and all of its children.
385 */
386static void rebuild_snap_realms(struct ceph_snap_realm *realm)
387{
388 struct ceph_snap_realm *child;
389
390 dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
391 build_snap_context(realm);
392
393 list_for_each_entry(child, &realm->children, child_item)
394 rebuild_snap_realms(child);
395}
396
397
398/*
399 * helper to allocate and decode an array of snapids. free prior
400 * instance, if any.
401 */
402static int dup_array(u64 **dst, __le64 *src, int num)
403{
404 int i;
405
406 kfree(*dst);
407 if (num) {
408 *dst = kcalloc(num, sizeof(u64), GFP_NOFS);
409 if (!*dst)
410 return -ENOMEM;
411 for (i = 0; i < num; i++)
412 (*dst)[i] = get_unaligned_le64(src + i);
413 } else {
414 *dst = NULL;
415 }
416 return 0;
417}
418
419
420/*
421 * When a snapshot is applied, the size/mtime inode metadata is queued
422 * in a ceph_cap_snap (one for each snapshot) until writeback
423 * completes and the metadata can be flushed back to the MDS.
424 *
425 * However, if a (sync) write is currently in-progress when we apply
426 * the snapshot, we have to wait until the write succeeds or fails
427 * (and a final size/mtime is known). In this case the
428 * cap_snap->writing = 1, and is said to be "pending." When the write
429 * finishes, we __ceph_finish_cap_snap().
430 *
431 * Caller must hold snap_rwsem for read (i.e., the realm topology won't
432 * change).
433 */
434void ceph_queue_cap_snap(struct ceph_inode_info *ci)
435{
436 struct inode *inode = &ci->vfs_inode;
437 struct ceph_cap_snap *capsnap;
438 int used;
439
440 capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
441 if (!capsnap) {
442 pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
443 return;
444 }
445
446 spin_lock(&inode->i_lock);
447 used = __ceph_caps_used(ci);
448 if (__ceph_have_pending_cap_snap(ci)) {
449 /* there is no point in queuing multiple "pending" cap_snaps,
450 as no new writes are allowed to start when pending, so any
451 writes in progress now were started before the previous
452 cap_snap. lucky us. */
453 dout("queue_cap_snap %p already pending\n", inode);
454 kfree(capsnap);
455 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
456 struct ceph_snap_context *snapc = ci->i_head_snapc;
457
458 igrab(inode);
459
460 atomic_set(&capsnap->nref, 1);
461 capsnap->ci = ci;
462 INIT_LIST_HEAD(&capsnap->ci_item);
463 INIT_LIST_HEAD(&capsnap->flushing_item);
464
465 capsnap->follows = snapc->seq - 1;
466 capsnap->issued = __ceph_caps_issued(ci, NULL);
467 capsnap->dirty = __ceph_caps_dirty(ci);
468
469 capsnap->mode = inode->i_mode;
470 capsnap->uid = inode->i_uid;
471 capsnap->gid = inode->i_gid;
472
473 /* fixme? */
474 capsnap->xattr_blob = NULL;
475 capsnap->xattr_len = 0;
476
477 /* dirty page count moved from _head to this cap_snap;
478 all subsequent page dirties occur _after_ this
479 snapshot. */
480 capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
481 ci->i_wrbuffer_ref_head = 0;
482 capsnap->context = snapc;
483 ci->i_head_snapc = NULL;
484 list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
485
486 if (used & CEPH_CAP_FILE_WR) {
487 dout("queue_cap_snap %p cap_snap %p snapc %p"
488 " seq %llu used WR, now pending\n", inode,
489 capsnap, snapc, snapc->seq);
490 capsnap->writing = 1;
491 } else {
492 /* note mtime, size NOW. */
493 __ceph_finish_cap_snap(ci, capsnap);
494 }
495 } else {
496 dout("queue_cap_snap %p nothing dirty|writing\n", inode);
497 kfree(capsnap);
498 }
499
500 spin_unlock(&inode->i_lock);
501}
502
503/*
504 * Finalize the size, mtime for a cap_snap; that is, settle on final values
505 * to be used for the snapshot, to be flushed back to the mds.
506 *
507 * If capsnap can now be flushed, add to snap_flush list, and return 1.
508 *
509 * Caller must hold i_lock.
510 */
511int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
512 struct ceph_cap_snap *capsnap)
513{
514 struct inode *inode = &ci->vfs_inode;
515 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
516
517 BUG_ON(capsnap->writing);
518 capsnap->size = inode->i_size;
519 capsnap->mtime = inode->i_mtime;
520 capsnap->atime = inode->i_atime;
521 capsnap->ctime = inode->i_ctime;
522 capsnap->time_warp_seq = ci->i_time_warp_seq;
523 if (capsnap->dirty_pages) {
524 dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
525 "still has %d dirty pages\n", inode, capsnap,
526 capsnap->context, capsnap->context->seq,
527 ceph_cap_string(capsnap->dirty), capsnap->size,
528 capsnap->dirty_pages);
529 return 0;
530 }
531 dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
532 inode, capsnap, capsnap->context,
533 capsnap->context->seq, ceph_cap_string(capsnap->dirty),
534 capsnap->size);
535
536 spin_lock(&mdsc->snap_flush_lock);
537 list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
538 spin_unlock(&mdsc->snap_flush_lock);
539 return 1; /* caller may want to ceph_flush_snaps */
540}
541
542
543/*
544 * Parse and apply a snapblob "snap trace" from the MDS. This specifies
545 * the snap realm parameters from a given realm and all of its ancestors,
546 * up to the root.
547 *
548 * Caller must hold snap_rwsem for write.
549 */
550int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
551 void *p, void *e, bool deletion)
552{
553 struct ceph_mds_snap_realm *ri; /* encoded */
554 __le64 *snaps; /* encoded */
555 __le64 *prior_parent_snaps; /* encoded */
556 struct ceph_snap_realm *realm;
557 int invalidate = 0;
558 int err = -ENOMEM;
559
560 dout("update_snap_trace deletion=%d\n", deletion);
561more:
562 ceph_decode_need(&p, e, sizeof(*ri), bad);
563 ri = p;
564 p += sizeof(*ri);
565 ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) +
566 le32_to_cpu(ri->num_prior_parent_snaps)), bad);
567 snaps = p;
568 p += sizeof(u64) * le32_to_cpu(ri->num_snaps);
569 prior_parent_snaps = p;
570 p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps);
571
572 realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino));
573 if (!realm) {
574 realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino));
575 if (IS_ERR(realm)) {
576 err = PTR_ERR(realm);
577 goto fail;
578 }
579 }
580
581 if (le64_to_cpu(ri->seq) > realm->seq) {
582 dout("update_snap_trace updating %llx %p %lld -> %lld\n",
583 realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
584 /*
585 * if the realm seq has changed, queue a cap_snap for every
586 * inode with open caps. we do this _before_ we update
587 * the realm info so that we prepare for writeback under the
588 * _previous_ snap context.
589 *
590 * ...unless it's a snap deletion!
591 */
592 if (!deletion) {
593 struct ceph_inode_info *ci;
594 struct inode *lastinode = NULL;
595
596 spin_lock(&realm->inodes_with_caps_lock);
597 list_for_each_entry(ci, &realm->inodes_with_caps,
598 i_snap_realm_item) {
599 struct inode *inode = igrab(&ci->vfs_inode);
600 if (!inode)
601 continue;
602 spin_unlock(&realm->inodes_with_caps_lock);
603 if (lastinode)
604 iput(lastinode);
605 lastinode = inode;
606 ceph_queue_cap_snap(ci);
607 spin_lock(&realm->inodes_with_caps_lock);
608 }
609 spin_unlock(&realm->inodes_with_caps_lock);
610 if (lastinode)
611 iput(lastinode);
612 dout("update_snap_trace cap_snaps queued\n");
613 }
614
615 } else {
616 dout("update_snap_trace %llx %p seq %lld unchanged\n",
617 realm->ino, realm, realm->seq);
618 }
619
620 /* ensure the parent is correct */
621 err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
622 if (err < 0)
623 goto fail;
624 invalidate += err;
625
626 if (le64_to_cpu(ri->seq) > realm->seq) {
627 /* update realm parameters, snap lists */
628 realm->seq = le64_to_cpu(ri->seq);
629 realm->created = le64_to_cpu(ri->created);
630 realm->parent_since = le64_to_cpu(ri->parent_since);
631
632 realm->num_snaps = le32_to_cpu(ri->num_snaps);
633 err = dup_array(&realm->snaps, snaps, realm->num_snaps);
634 if (err < 0)
635 goto fail;
636
637 realm->num_prior_parent_snaps =
638 le32_to_cpu(ri->num_prior_parent_snaps);
639 err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps,
640 realm->num_prior_parent_snaps);
641 if (err < 0)
642 goto fail;
643
644 invalidate = 1;
645 } else if (!realm->cached_context) {
646 invalidate = 1;
647 }
648
649 dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
650 realm, invalidate, p, e);
651
652 if (p < e)
653 goto more;
654
655 /* invalidate when we reach the _end_ (root) of the trace */
656 if (invalidate)
657 rebuild_snap_realms(realm);
658
659 __cleanup_empty_realms(mdsc);
660 return 0;
661
662bad:
663 err = -EINVAL;
664fail:
665 pr_err("update_snap_trace error %d\n", err);
666 return err;
667}
668
669
670/*
671 * Send any cap_snaps that are queued for flush. Try to carry
672 * s_mutex across multiple snap flushes to avoid locking overhead.
673 *
674 * Caller holds no locks.
675 */
676static void flush_snaps(struct ceph_mds_client *mdsc)
677{
678 struct ceph_inode_info *ci;
679 struct inode *inode;
680 struct ceph_mds_session *session = NULL;
681
682 dout("flush_snaps\n");
683 spin_lock(&mdsc->snap_flush_lock);
684 while (!list_empty(&mdsc->snap_flush_list)) {
685 ci = list_first_entry(&mdsc->snap_flush_list,
686 struct ceph_inode_info, i_snap_flush_item);
687 inode = &ci->vfs_inode;
688 igrab(inode);
689 spin_unlock(&mdsc->snap_flush_lock);
690 spin_lock(&inode->i_lock);
691 __ceph_flush_snaps(ci, &session);
692 spin_unlock(&inode->i_lock);
693 iput(inode);
694 spin_lock(&mdsc->snap_flush_lock);
695 }
696 spin_unlock(&mdsc->snap_flush_lock);
697
698 if (session) {
699 mutex_unlock(&session->s_mutex);
700 ceph_put_mds_session(session);
701 }
702 dout("flush_snaps done\n");
703}
704
705
706/*
707 * Handle a snap notification from the MDS.
708 *
709 * This can take two basic forms: the simplest is just a snap creation
710 * or deletion notification on an existing realm. This should update the
711 * realm and its children.
712 *
713 * The more difficult case is realm creation, due to snap creation at a
714 * new point in the file hierarchy, or due to a rename that moves a file or
715 * directory into another realm.
716 */
717void ceph_handle_snap(struct ceph_mds_client *mdsc,
718 struct ceph_mds_session *session,
719 struct ceph_msg *msg)
720{
721 struct super_block *sb = mdsc->client->sb;
722 int mds = session->s_mds;
723 u64 split;
724 int op;
725 int trace_len;
726 struct ceph_snap_realm *realm = NULL;
727 void *p = msg->front.iov_base;
728 void *e = p + msg->front.iov_len;
729 struct ceph_mds_snap_head *h;
730 int num_split_inos, num_split_realms;
731 __le64 *split_inos = NULL, *split_realms = NULL;
732 int i;
733 int locked_rwsem = 0;
734
735 /* decode */
736 if (msg->front.iov_len < sizeof(*h))
737 goto bad;
738 h = p;
739 op = le32_to_cpu(h->op);
740 split = le64_to_cpu(h->split); /* non-zero if we are splitting an
741 * existing realm */
742 num_split_inos = le32_to_cpu(h->num_split_inos);
743 num_split_realms = le32_to_cpu(h->num_split_realms);
744 trace_len = le32_to_cpu(h->trace_len);
745 p += sizeof(*h);
746
747 dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
748 ceph_snap_op_name(op), split, trace_len);
749
750 mutex_lock(&session->s_mutex);
751 session->s_seq++;
752 mutex_unlock(&session->s_mutex);
753
754 down_write(&mdsc->snap_rwsem);
755 locked_rwsem = 1;
756
757 if (op == CEPH_SNAP_OP_SPLIT) {
758 struct ceph_mds_snap_realm *ri;
759
760 /*
761 * A "split" breaks part of an existing realm off into
762 * a new realm. The MDS provides a list of inodes
763 * (with caps) and child realms that belong to the new
764 * child.
765 */
766 split_inos = p;
767 p += sizeof(u64) * num_split_inos;
768 split_realms = p;
769 p += sizeof(u64) * num_split_realms;
770 ceph_decode_need(&p, e, sizeof(*ri), bad);
771 /* we will peek at realm info here, but will _not_
772 * advance p, as the realm update will occur below in
773 * ceph_update_snap_trace. */
774 ri = p;
775
776 realm = ceph_lookup_snap_realm(mdsc, split);
777 if (!realm) {
778 realm = ceph_create_snap_realm(mdsc, split);
779 if (IS_ERR(realm))
780 goto out;
781 }
782 ceph_get_snap_realm(mdsc, realm);
783
784 dout("splitting snap_realm %llx %p\n", realm->ino, realm);
785 for (i = 0; i < num_split_inos; i++) {
786 struct ceph_vino vino = {
787 .ino = le64_to_cpu(split_inos[i]),
788 .snap = CEPH_NOSNAP,
789 };
790 struct inode *inode = ceph_find_inode(sb, vino);
791 struct ceph_inode_info *ci;
792
793 if (!inode)
794 continue;
795 ci = ceph_inode(inode);
796
797 spin_lock(&inode->i_lock);
798 if (!ci->i_snap_realm)
799 goto skip_inode;
800 /*
801 * If this inode belongs to a realm that was
802 * created after our new realm, we experienced
803		 * a race (due to another split notification
804 * arriving from a different MDS). So skip
805 * this inode.
806 */
807 if (ci->i_snap_realm->created >
808 le64_to_cpu(ri->created)) {
809 dout(" leaving %p in newer realm %llx %p\n",
810 inode, ci->i_snap_realm->ino,
811 ci->i_snap_realm);
812 goto skip_inode;
813 }
814 dout(" will move %p to split realm %llx %p\n",
815 inode, realm->ino, realm);
816 /*
817 * Remove the inode from the realm's inode
818 * list, but don't add it to the new realm
819 * yet. We don't want the cap_snap to be
820 * queued (again) by ceph_update_snap_trace()
821 * below. Queue it _now_, under the old context.
822 */
823 spin_lock(&realm->inodes_with_caps_lock);
824 list_del_init(&ci->i_snap_realm_item);
825 spin_unlock(&realm->inodes_with_caps_lock);
826 spin_unlock(&inode->i_lock);
827
828 ceph_queue_cap_snap(ci);
829
830 iput(inode);
831 continue;
832
833skip_inode:
834 spin_unlock(&inode->i_lock);
835 iput(inode);
836 }
837
838 /* we may have taken some of the old realm's children. */
839 for (i = 0; i < num_split_realms; i++) {
840 struct ceph_snap_realm *child =
841 ceph_lookup_snap_realm(mdsc,
842 le64_to_cpu(split_realms[i]));
843 if (!child)
844 continue;
845 adjust_snap_realm_parent(mdsc, child, realm->ino);
846 }
847 }
848
849 /*
850 * update using the provided snap trace. if we are deleting a
851 * snap, we can avoid queueing cap_snaps.
852 */
853 ceph_update_snap_trace(mdsc, p, e,
854 op == CEPH_SNAP_OP_DESTROY);
855
856 if (op == CEPH_SNAP_OP_SPLIT) {
857 /*
858 * ok, _now_ add the inodes into the new realm.
859 */
860 for (i = 0; i < num_split_inos; i++) {
861 struct ceph_vino vino = {
862 .ino = le64_to_cpu(split_inos[i]),
863 .snap = CEPH_NOSNAP,
864 };
865 struct inode *inode = ceph_find_inode(sb, vino);
866 struct ceph_inode_info *ci;
867
868 if (!inode)
869 continue;
870 ci = ceph_inode(inode);
871 spin_lock(&inode->i_lock);
872 if (list_empty(&ci->i_snap_realm_item)) {
873 struct ceph_snap_realm *oldrealm =
874 ci->i_snap_realm;
875
876 dout(" moving %p to split realm %llx %p\n",
877 inode, realm->ino, realm);
878 spin_lock(&realm->inodes_with_caps_lock);
879 list_add(&ci->i_snap_realm_item,
880 &realm->inodes_with_caps);
881 ci->i_snap_realm = realm;
882 spin_unlock(&realm->inodes_with_caps_lock);
883 ceph_get_snap_realm(mdsc, realm);
884 ceph_put_snap_realm(mdsc, oldrealm);
885 }
886 spin_unlock(&inode->i_lock);
887 iput(inode);
888 }
889
890 /* we took a reference when we created the realm, above */
891 ceph_put_snap_realm(mdsc, realm);
892 }
893
894 __cleanup_empty_realms(mdsc);
895
896 up_write(&mdsc->snap_rwsem);
897
898 flush_snaps(mdsc);
899 return;
900
901bad:
902 pr_err("corrupt snap message from mds%d\n", mds);
903 ceph_msg_dump(msg);
904out:
905 if (locked_rwsem)
906 up_write(&mdsc->snap_rwsem);
907 return;
908}
909
910
911
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
new file mode 100644
index 000000000000..110857ba9269
--- /dev/null
+++ b/fs/ceph/super.c
@@ -0,0 +1,1041 @@
1
2#include "ceph_debug.h"
3
4#include <linux/backing-dev.h>
5#include <linux/fs.h>
6#include <linux/inet.h>
7#include <linux/in6.h>
8#include <linux/module.h>
9#include <linux/mount.h>
10#include <linux/parser.h>
11#include <linux/rwsem.h>
12#include <linux/sched.h>
13#include <linux/seq_file.h>
14#include <linux/slab.h>
15#include <linux/statfs.h>
16#include <linux/string.h>
17#include <linux/version.h>
18#include <linux/vmalloc.h>
19
20#include "decode.h"
21#include "super.h"
22#include "mon_client.h"
23#include "auth.h"
24
25/*
26 * Ceph superblock operations
27 *
28 * Handle the basics of mounting, unmounting.
29 */
30
31
32/*
33 * find filename portion of a path (/foo/bar/baz -> baz)
34 */
35const char *ceph_file_part(const char *s, int len)
36{
37 const char *e = s + len;
38
39 while (e != s && *(e-1) != '/')
40 e--;
41 return e;
42}
43
44
45/*
46 * super ops
47 */
48static void ceph_put_super(struct super_block *s)
49{
50 struct ceph_client *client = ceph_sb_to_client(s);
51
52 dout("put_super\n");
53 ceph_mdsc_close_sessions(&client->mdsc);
54
55 /*
56 * ensure we release the bdi before put_anon_super releases
57 * the device name.
58 */
59 if (s->s_bdi == &client->backing_dev_info) {
60 bdi_unregister(&client->backing_dev_info);
61 s->s_bdi = NULL;
62 }
63
64 return;
65}
66
67static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
68{
69 struct ceph_client *client = ceph_inode_to_client(dentry->d_inode);
70 struct ceph_monmap *monmap = client->monc.monmap;
71 struct ceph_statfs st;
72 u64 fsid;
73 int err;
74
75 dout("statfs\n");
76 err = ceph_monc_do_statfs(&client->monc, &st);
77 if (err < 0)
78 return err;
79
80 /* fill in kstatfs */
81 buf->f_type = CEPH_SUPER_MAGIC; /* ?? */
82
83 /*
84 * express utilization in terms of large blocks to avoid
85 * overflow on 32-bit machines.
86 */
87 buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
88 buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
89 buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >>
90 (CEPH_BLOCK_SHIFT-10);
91 buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
92
93 buf->f_files = le64_to_cpu(st.num_objects);
94 buf->f_ffree = -1;
95 buf->f_namelen = PATH_MAX;
96 buf->f_frsize = PAGE_CACHE_SIZE;
97
98 /* leave fsid little-endian, regardless of host endianness */
99 fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
100 buf->f_fsid.val[0] = fsid & 0xffffffff;
101 buf->f_fsid.val[1] = fsid >> 32;
102
103 return 0;
104}
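/*
 * Worked example for the scaling above (illustrative, not part of the
 * original source): with CEPH_BLOCK_SHIFT = 20, f_bsize is 1 MB and
 * each KB count shifts right by (20 - 10) = 10 bits, so st.kb = 2097152
 * (a 2 GB pool) becomes f_blocks = 2097152 >> 10 = 2048 one-megabyte
 * blocks -- small enough for a 32-bit f_blocks even on large volumes.
 */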
105
106
107static int ceph_syncfs(struct super_block *sb, int wait)
108{
109 dout("sync_fs %d\n", wait);
110 ceph_osdc_sync(&ceph_client(sb)->osdc);
111 ceph_mdsc_sync(&ceph_client(sb)->mdsc);
112 dout("sync_fs %d done\n", wait);
113 return 0;
114}
115
116
117/**
118 * ceph_show_options - Show mount options in /proc/mounts
119 * @m: seq_file to write to
120 * @mnt: mount descriptor
121 */
122static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
123{
124 struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
125 struct ceph_mount_args *args = client->mount_args;
126
127 if (args->flags & CEPH_OPT_FSID)
128		seq_printf(m, ",fsidmajor=%llu,fsidminor=%llu",
129 le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]),
130 le64_to_cpu(*(__le64 *)&args->fsid.fsid[8]));
131 if (args->flags & CEPH_OPT_NOSHARE)
132 seq_puts(m, ",noshare");
133 if (args->flags & CEPH_OPT_DIRSTAT)
134 seq_puts(m, ",dirstat");
135 if ((args->flags & CEPH_OPT_RBYTES) == 0)
136 seq_puts(m, ",norbytes");
137 if (args->flags & CEPH_OPT_NOCRC)
138 seq_puts(m, ",nocrc");
139 if (args->flags & CEPH_OPT_NOASYNCREADDIR)
140 seq_puts(m, ",noasyncreaddir");
141 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
142 seq_printf(m, ",snapdirname=%s", args->snapdir_name);
143 if (args->name)
144 seq_printf(m, ",name=%s", args->name);
145 if (args->secret)
146 seq_puts(m, ",secret=<hidden>");
147 return 0;
148}
149
150/*
151 * caches
152 */
153struct kmem_cache *ceph_inode_cachep;
154struct kmem_cache *ceph_cap_cachep;
155struct kmem_cache *ceph_dentry_cachep;
156struct kmem_cache *ceph_file_cachep;
157
158static void ceph_inode_init_once(void *foo)
159{
160 struct ceph_inode_info *ci = foo;
161 inode_init_once(&ci->vfs_inode);
162}
163
164static int default_congestion_kb(void)
165{
166 int congestion_kb;
167
168 /*
169 * Copied from NFS
170 *
171 * congestion size, scale with available memory.
172 *
173 * 64MB: 8192k
174 * 128MB: 11585k
175 * 256MB: 16384k
176 * 512MB: 23170k
177 * 1GB: 32768k
178 * 2GB: 46340k
179 * 4GB: 65536k
180 * 8GB: 92681k
181 * 16GB: 131072k
182 *
183 * This allows larger machines to have larger/more transfers.
184 * Limit the default to 256M
185 */
186 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
187 if (congestion_kb > 256*1024)
188 congestion_kb = 256*1024;
189
190 return congestion_kb;
191}
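/*
 * Worked example for the formula above (illustrative): with 1 GB of RAM
 * and 4 KB pages, totalram_pages = 262144 and int_sqrt(262144) = 512,
 * so congestion_kb = (16 * 512) << (12 - 10) = 32768k, matching the
 * 1GB row of the table.
 */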
192
193static int __init init_caches(void)
194{
195 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
196 sizeof(struct ceph_inode_info),
197 __alignof__(struct ceph_inode_info),
198 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
199 ceph_inode_init_once);
200 if (ceph_inode_cachep == NULL)
201 return -ENOMEM;
202
203 ceph_cap_cachep = KMEM_CACHE(ceph_cap,
204 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
205 if (ceph_cap_cachep == NULL)
206 goto bad_cap;
207
208 ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
209 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
210 if (ceph_dentry_cachep == NULL)
211 goto bad_dentry;
212
213 ceph_file_cachep = KMEM_CACHE(ceph_file_info,
214 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
215 if (ceph_file_cachep == NULL)
216 goto bad_file;
217
218 return 0;
219
220bad_file:
221 kmem_cache_destroy(ceph_dentry_cachep);
222bad_dentry:
223 kmem_cache_destroy(ceph_cap_cachep);
224bad_cap:
225 kmem_cache_destroy(ceph_inode_cachep);
226 return -ENOMEM;
227}
228
229static void destroy_caches(void)
230{
231 kmem_cache_destroy(ceph_inode_cachep);
232 kmem_cache_destroy(ceph_cap_cachep);
233 kmem_cache_destroy(ceph_dentry_cachep);
234 kmem_cache_destroy(ceph_file_cachep);
235}
236
237
238/*
239 * ceph_umount_begin - initiate forced umount. Tear down the
240 * mount, skipping steps that may hang while waiting for server(s).
241 */
242static void ceph_umount_begin(struct super_block *sb)
243{
244 struct ceph_client *client = ceph_sb_to_client(sb);
245
246 dout("ceph_umount_begin - starting forced umount\n");
247 if (!client)
248 return;
249 client->mount_state = CEPH_MOUNT_SHUTDOWN;
250 return;
251}
252
253static const struct super_operations ceph_super_ops = {
254 .alloc_inode = ceph_alloc_inode,
255 .destroy_inode = ceph_destroy_inode,
256 .write_inode = ceph_write_inode,
257 .sync_fs = ceph_syncfs,
258 .put_super = ceph_put_super,
259 .show_options = ceph_show_options,
260 .statfs = ceph_statfs,
261 .umount_begin = ceph_umount_begin,
262};
263
264
265const char *ceph_msg_type_name(int type)
266{
267 switch (type) {
268 case CEPH_MSG_SHUTDOWN: return "shutdown";
269 case CEPH_MSG_PING: return "ping";
270 case CEPH_MSG_AUTH: return "auth";
271 case CEPH_MSG_AUTH_REPLY: return "auth_reply";
272 case CEPH_MSG_MON_MAP: return "mon_map";
273 case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
274 case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
275 case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
276 case CEPH_MSG_STATFS: return "statfs";
277 case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
278 case CEPH_MSG_MDS_MAP: return "mds_map";
279 case CEPH_MSG_CLIENT_SESSION: return "client_session";
280 case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
281 case CEPH_MSG_CLIENT_REQUEST: return "client_request";
282 case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
283 case CEPH_MSG_CLIENT_REPLY: return "client_reply";
284 case CEPH_MSG_CLIENT_CAPS: return "client_caps";
285 case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
286 case CEPH_MSG_CLIENT_SNAP: return "client_snap";
287 case CEPH_MSG_CLIENT_LEASE: return "client_lease";
288 case CEPH_MSG_OSD_MAP: return "osd_map";
289 case CEPH_MSG_OSD_OP: return "osd_op";
290 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
291 default: return "unknown";
292 }
293}
294
295
296/*
297 * mount options
298 */
299enum {
300 Opt_fsidmajor,
301 Opt_fsidminor,
302 Opt_monport,
303 Opt_wsize,
304 Opt_rsize,
305 Opt_osdtimeout,
306 Opt_osdkeepalivetimeout,
307 Opt_mount_timeout,
308 Opt_osd_idle_ttl,
309 Opt_caps_wanted_delay_min,
310 Opt_caps_wanted_delay_max,
311 Opt_readdir_max_entries,
312 Opt_congestion_kb,
313 Opt_last_int,
314 /* int args above */
315 Opt_snapdirname,
316 Opt_name,
317 Opt_secret,
318 Opt_last_string,
319 /* string args above */
320 Opt_ip,
321 Opt_noshare,
322 Opt_dirstat,
323 Opt_nodirstat,
324 Opt_rbytes,
325 Opt_norbytes,
326 Opt_nocrc,
327 Opt_noasyncreaddir,
328};
329
330static match_table_t arg_tokens = {
331 {Opt_fsidmajor, "fsidmajor=%ld"},
332 {Opt_fsidminor, "fsidminor=%ld"},
333 {Opt_monport, "monport=%d"},
334 {Opt_wsize, "wsize=%d"},
335 {Opt_rsize, "rsize=%d"},
336 {Opt_osdtimeout, "osdtimeout=%d"},
337 {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
338 {Opt_mount_timeout, "mount_timeout=%d"},
339 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
340 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
341 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
342 {Opt_readdir_max_entries, "readdir_max_entries=%d"},
343 {Opt_congestion_kb, "write_congestion_kb=%d"},
344 /* int args above */
345 {Opt_snapdirname, "snapdirname=%s"},
346 {Opt_name, "name=%s"},
347 {Opt_secret, "secret=%s"},
348 /* string args above */
349 {Opt_ip, "ip=%s"},
350 {Opt_noshare, "noshare"},
351 {Opt_dirstat, "dirstat"},
352 {Opt_nodirstat, "nodirstat"},
353 {Opt_rbytes, "rbytes"},
354 {Opt_norbytes, "norbytes"},
355 {Opt_nocrc, "nocrc"},
356 {Opt_noasyncreaddir, "noasyncreaddir"},
357 {-1, NULL}
358};
359
360
361static struct ceph_mount_args *parse_mount_args(int flags, char *options,
362 const char *dev_name,
363 const char **path)
364{
365 struct ceph_mount_args *args;
366 const char *c;
367 int err = -ENOMEM;
368 substring_t argstr[MAX_OPT_ARGS];
369
370 args = kzalloc(sizeof(*args), GFP_KERNEL);
371 if (!args)
372 return ERR_PTR(-ENOMEM);
373 args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr),
374 GFP_KERNEL);
375 if (!args->mon_addr)
376 goto out;
377
378 dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name);
379
380 /* start with defaults */
381 args->sb_flags = flags;
382 args->flags = CEPH_OPT_DEFAULT;
383 args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
384 args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
385 args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
386 args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
387 args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
388 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
389 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
390 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
391 args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4;
392 args->max_readdir = 1024;
393 args->congestion_kb = default_congestion_kb();
394
395 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
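	/* e.g. "192.168.0.1:6789,192.168.0.2:6789:/export/dir" names two
	 * monitors (6789 is the default monitor port) and the server-side
	 * path /export/dir -- illustrative addresses, not from the
	 * original source */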
396 err = -EINVAL;
397 if (!dev_name)
398 goto out;
399 *path = strstr(dev_name, ":/");
400 if (*path == NULL) {
401 pr_err("device name is missing path (no :/ in %s)\n",
402 dev_name);
403 goto out;
404 }
405
406 /* get mon ip(s) */
407 err = ceph_parse_ips(dev_name, *path, args->mon_addr,
408 CEPH_MAX_MON, &args->num_mon);
409 if (err < 0)
410 goto out;
411
412 /* path on server */
413 *path += 2;
414 dout("server path '%s'\n", *path);
415
416 /* parse mount options */
417 while ((c = strsep(&options, ",")) != NULL) {
418 int token, intval, ret;
419 if (!*c)
420 continue;
421 err = -EINVAL;
422 token = match_token((char *)c, arg_tokens, argstr);
423 if (token < 0) {
424 pr_err("bad mount option at '%s'\n", c);
425 goto out;
426 }
427 if (token < Opt_last_int) {
428 ret = match_int(&argstr[0], &intval);
429 if (ret < 0) {
430 pr_err("bad mount option arg (not int) "
431 "at '%s'\n", c);
432 continue;
433 }
434 dout("got int token %d val %d\n", token, intval);
435 } else if (token > Opt_last_int && token < Opt_last_string) {
436 dout("got string token %d val %s\n", token,
437 argstr[0].from);
438 } else {
439 dout("got token %d\n", token);
440 }
441 switch (token) {
442 case Opt_fsidmajor:
443 *(__le64 *)&args->fsid.fsid[0] = cpu_to_le64(intval);
444 break;
445 case Opt_fsidminor:
446 *(__le64 *)&args->fsid.fsid[8] = cpu_to_le64(intval);
447 break;
448 case Opt_ip:
449 err = ceph_parse_ips(argstr[0].from,
450 argstr[0].to,
451 &args->my_addr,
452 1, NULL);
453 if (err < 0)
454 goto out;
455 args->flags |= CEPH_OPT_MYIP;
456 break;
457
458 case Opt_snapdirname:
459 kfree(args->snapdir_name);
460 args->snapdir_name = kstrndup(argstr[0].from,
461 argstr[0].to-argstr[0].from,
462 GFP_KERNEL);
463 break;
464 case Opt_name:
465 args->name = kstrndup(argstr[0].from,
466 argstr[0].to-argstr[0].from,
467 GFP_KERNEL);
468 break;
469 case Opt_secret:
470 args->secret = kstrndup(argstr[0].from,
471 argstr[0].to-argstr[0].from,
472 GFP_KERNEL);
473 break;
474
475 /* misc */
476 case Opt_wsize:
477 args->wsize = intval;
478 break;
479 case Opt_rsize:
480 args->rsize = intval;
481 break;
482 case Opt_osdtimeout:
483 args->osd_timeout = intval;
484 break;
485 case Opt_osdkeepalivetimeout:
486 args->osd_keepalive_timeout = intval;
487 break;
488 case Opt_mount_timeout:
489 args->mount_timeout = intval;
490 break;
491 case Opt_caps_wanted_delay_min:
492 args->caps_wanted_delay_min = intval;
493 break;
494 case Opt_caps_wanted_delay_max:
495 args->caps_wanted_delay_max = intval;
496 break;
497 case Opt_readdir_max_entries:
498 args->max_readdir = intval;
499 break;
500 case Opt_congestion_kb:
501 args->congestion_kb = intval;
502 break;
503
504 case Opt_noshare:
505 args->flags |= CEPH_OPT_NOSHARE;
506 break;
507
508 case Opt_dirstat:
509 args->flags |= CEPH_OPT_DIRSTAT;
510 break;
511 case Opt_nodirstat:
512 args->flags &= ~CEPH_OPT_DIRSTAT;
513 break;
514 case Opt_rbytes:
515 args->flags |= CEPH_OPT_RBYTES;
516 break;
517 case Opt_norbytes:
518 args->flags &= ~CEPH_OPT_RBYTES;
519 break;
520 case Opt_nocrc:
521 args->flags |= CEPH_OPT_NOCRC;
522 break;
523 case Opt_noasyncreaddir:
524 args->flags |= CEPH_OPT_NOASYNCREADDIR;
525 break;
526
527 default:
528 BUG_ON(token);
529 }
530 }
531 return args;
532
533out:
534 kfree(args->mon_addr);
535 kfree(args);
536 return ERR_PTR(err);
537}
538
539static void destroy_mount_args(struct ceph_mount_args *args)
540{
541 dout("destroy_mount_args %p\n", args);
542 kfree(args->snapdir_name);
543 args->snapdir_name = NULL;
544 kfree(args->name);
545 args->name = NULL;
546 kfree(args->secret);
547 args->secret = NULL;
548 kfree(args);
549}
550
551/*
552 * create a fresh client instance
553 */
554static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
555{
556 struct ceph_client *client;
557 int err = -ENOMEM;
558
559 client = kzalloc(sizeof(*client), GFP_KERNEL);
560 if (client == NULL)
561 return ERR_PTR(-ENOMEM);
562
563 mutex_init(&client->mount_mutex);
564
565 init_waitqueue_head(&client->auth_wq);
566
567 client->sb = NULL;
568 client->mount_state = CEPH_MOUNT_MOUNTING;
569 client->mount_args = args;
570
571 client->msgr = NULL;
572
573 client->auth_err = 0;
574 atomic_long_set(&client->writeback_count, 0);
575
576 err = bdi_init(&client->backing_dev_info);
577 if (err < 0)
578 goto fail;
579
580 err = -ENOMEM;
581 client->wb_wq = create_workqueue("ceph-writeback");
582 if (client->wb_wq == NULL)
583 goto fail_bdi;
584 client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
585 if (client->pg_inv_wq == NULL)
586 goto fail_wb_wq;
587 client->trunc_wq = create_singlethread_workqueue("ceph-trunc");
588 if (client->trunc_wq == NULL)
589 goto fail_pg_inv_wq;
590
591 /* set up mempools */
592 err = -ENOMEM;
593 client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
594 client->mount_args->wsize >> PAGE_CACHE_SHIFT);
595 if (!client->wb_pagevec_pool)
596 goto fail_trunc_wq;
597
598 /* caps */
599 client->min_caps = args->max_readdir;
600 ceph_adjust_min_caps(client->min_caps);
601
602 /* subsystems */
603 err = ceph_monc_init(&client->monc, client);
604 if (err < 0)
605 goto fail_mempool;
606 err = ceph_osdc_init(&client->osdc, client);
607 if (err < 0)
608 goto fail_monc;
609 err = ceph_mdsc_init(&client->mdsc, client);
610 if (err < 0)
611 goto fail_osdc;
612 return client;
613
614fail_osdc:
615 ceph_osdc_stop(&client->osdc);
616fail_monc:
617 ceph_monc_stop(&client->monc);
618fail_mempool:
619 mempool_destroy(client->wb_pagevec_pool);
620fail_trunc_wq:
621 destroy_workqueue(client->trunc_wq);
622fail_pg_inv_wq:
623 destroy_workqueue(client->pg_inv_wq);
624fail_wb_wq:
625 destroy_workqueue(client->wb_wq);
626fail_bdi:
627 bdi_destroy(&client->backing_dev_info);
628fail:
629 kfree(client);
630 return ERR_PTR(err);
631}
632
633static void ceph_destroy_client(struct ceph_client *client)
634{
635 dout("destroy_client %p\n", client);
636
637 /* unmount */
638 ceph_mdsc_stop(&client->mdsc);
639 ceph_monc_stop(&client->monc);
640 ceph_osdc_stop(&client->osdc);
641
642 ceph_adjust_min_caps(-client->min_caps);
643
644 ceph_debugfs_client_cleanup(client);
645 destroy_workqueue(client->wb_wq);
646 destroy_workqueue(client->pg_inv_wq);
647 destroy_workqueue(client->trunc_wq);
648
649 bdi_destroy(&client->backing_dev_info);
650
651 if (client->msgr)
652 ceph_messenger_destroy(client->msgr);
653 mempool_destroy(client->wb_pagevec_pool);
654
655 destroy_mount_args(client->mount_args);
656
657 kfree(client);
658 dout("destroy_client %p done\n", client);
659}
660
661/*
662 * Initially learn our fsid, or verify an fsid matches.
663 */
664int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
665{
666 if (client->have_fsid) {
667 if (ceph_fsid_compare(&client->fsid, fsid)) {
668			pr_err("bad fsid, had " FSID_FORMAT " got " FSID_FORMAT "\n",
669 PR_FSID(&client->fsid), PR_FSID(fsid));
670 return -1;
671 }
672 } else {
673 pr_info("client%lld fsid " FSID_FORMAT "\n",
674 client->monc.auth->global_id, PR_FSID(fsid));
675 memcpy(&client->fsid, fsid, sizeof(*fsid));
676 ceph_debugfs_client_init(client);
677 client->have_fsid = true;
678 }
679 return 0;
680}
681
682/*
683 * true if we have the mon map (and have thus joined the cluster)
684 */
685static int have_mon_map(struct ceph_client *client)
686{
687 return client->monc.monmap && client->monc.monmap->epoch;
688}
689
690/*
691 * Bootstrap mount by opening the root directory. Note the mount
692 * @started time from caller, and time out if this takes too long.
693 */
694static struct dentry *open_root_dentry(struct ceph_client *client,
695 const char *path,
696 unsigned long started)
697{
698 struct ceph_mds_client *mdsc = &client->mdsc;
699 struct ceph_mds_request *req = NULL;
700 int err;
701 struct dentry *root;
702
703 /* open dir */
704 dout("open_root_inode opening '%s'\n", path);
705 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
706 if (IS_ERR(req))
707 return ERR_PTR(PTR_ERR(req));
708 req->r_path1 = kstrdup(path, GFP_NOFS);
709 req->r_ino1.ino = CEPH_INO_ROOT;
710 req->r_ino1.snap = CEPH_NOSNAP;
711 req->r_started = started;
712 req->r_timeout = client->mount_args->mount_timeout * HZ;
713 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
714 req->r_num_caps = 2;
715 err = ceph_mdsc_do_request(mdsc, NULL, req);
716 if (err == 0) {
717 dout("open_root_inode success\n");
718 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
719 client->sb->s_root == NULL)
720 root = d_alloc_root(req->r_target_inode);
721 else
722 root = d_obtain_alias(req->r_target_inode);
723 req->r_target_inode = NULL;
724 dout("open_root_inode success, root dentry is %p\n", root);
725 } else {
726 root = ERR_PTR(err);
727 }
728 ceph_mdsc_put_request(req);
729 return root;
730}
731
732/*
733 * mount: join the ceph cluster, and open root directory.
734 */
735static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
736 const char *path)
737{
738 struct ceph_entity_addr *myaddr = NULL;
739 int err;
740 unsigned long timeout = client->mount_args->mount_timeout * HZ;
741 unsigned long started = jiffies; /* note the start time */
742 struct dentry *root;
743
744 dout("mount start\n");
745 mutex_lock(&client->mount_mutex);
746
747 /* initialize the messenger */
748 if (client->msgr == NULL) {
749 if (ceph_test_opt(client, MYIP))
750 myaddr = &client->mount_args->my_addr;
751 client->msgr = ceph_messenger_create(myaddr);
752 if (IS_ERR(client->msgr)) {
753 err = PTR_ERR(client->msgr);
754 client->msgr = NULL;
755 goto out;
756 }
757 client->msgr->nocrc = ceph_test_opt(client, NOCRC);
758 }
759
760 /* open session, and wait for mon, mds, and osd maps */
761 err = ceph_monc_open_session(&client->monc);
762 if (err < 0)
763 goto out;
764
765 while (!have_mon_map(client)) {
766 err = -EIO;
767 if (timeout && time_after_eq(jiffies, started + timeout))
768 goto out;
769
770 /* wait */
771 dout("mount waiting for mon_map\n");
772 err = wait_event_interruptible_timeout(client->auth_wq,
773 have_mon_map(client) || (client->auth_err < 0),
774 timeout);
775 if (err == -EINTR || err == -ERESTARTSYS)
776 goto out;
777 if (client->auth_err < 0) {
778 err = client->auth_err;
779 goto out;
780 }
781 }
782
783 dout("mount opening root\n");
784 root = open_root_dentry(client, "", started);
785 if (IS_ERR(root)) {
786 err = PTR_ERR(root);
787 goto out;
788 }
789 if (client->sb->s_root)
790 dput(root);
791 else
792 client->sb->s_root = root;
793
794 if (path[0] == 0) {
795 dget(root);
796 } else {
797 dout("mount opening base mountpoint\n");
798 root = open_root_dentry(client, path, started);
799 if (IS_ERR(root)) {
800 err = PTR_ERR(root);
801 dput(client->sb->s_root);
802 client->sb->s_root = NULL;
803 goto out;
804 }
805 }
806
807 mnt->mnt_root = root;
808 mnt->mnt_sb = client->sb;
809
810 client->mount_state = CEPH_MOUNT_MOUNTED;
811 dout("mount success\n");
812 err = 0;
813
814out:
815 mutex_unlock(&client->mount_mutex);
816 return err;
817}
818
819static int ceph_set_super(struct super_block *s, void *data)
820{
821 struct ceph_client *client = data;
822 int ret;
823
824 dout("set_super %p data %p\n", s, data);
825
826 s->s_flags = client->mount_args->sb_flags;
827 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
828
829 s->s_fs_info = client;
830 client->sb = s;
831
832 s->s_op = &ceph_super_ops;
833 s->s_export_op = &ceph_export_ops;
834
835 s->s_time_gran = 1000; /* 1000 ns == 1 us */
836
837 ret = set_anon_super(s, NULL); /* what is that second arg for? */
838 if (ret != 0)
839 goto fail;
840
841 return ret;
842
843fail:
844 s->s_fs_info = NULL;
845 client->sb = NULL;
846 return ret;
847}
848
849/*
850 * share superblock if same fs AND options
851 */
852static int ceph_compare_super(struct super_block *sb, void *data)
853{
854 struct ceph_client *new = data;
855 struct ceph_mount_args *args = new->mount_args;
856 struct ceph_client *other = ceph_sb_to_client(sb);
857 int i;
858
859 dout("ceph_compare_super %p\n", sb);
860 if (args->flags & CEPH_OPT_FSID) {
861 if (ceph_fsid_compare(&args->fsid, &other->fsid)) {
862 dout("fsid doesn't match\n");
863 return 0;
864 }
865 } else {
866 /* do we share (a) monitor? */
867 for (i = 0; i < new->monc.monmap->num_mon; i++)
868 if (ceph_monmap_contains(other->monc.monmap,
869 &new->monc.monmap->mon_inst[i].addr))
870 break;
871 if (i == new->monc.monmap->num_mon) {
872 dout("mon ip not part of monmap\n");
873 return 0;
874 }
875 dout("mon ip matches existing sb %p\n", sb);
876 }
877 if (args->sb_flags != other->mount_args->sb_flags) {
878 dout("flags differ\n");
879 return 0;
880 }
881 return 1;
882}
883
884/*
885 * construct our own bdi so we can control readahead, etc.
886 */
887static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
888{
889 int err;
890
891 /* set ra_pages based on rsize mount option? */
892 if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
893 client->backing_dev_info.ra_pages =
894 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
895			>> PAGE_CACHE_SHIFT;
896 err = bdi_register_dev(&client->backing_dev_info, sb->s_dev);
897 if (!err)
898 sb->s_bdi = &client->backing_dev_info;
899 return err;
900}
901
902static int ceph_get_sb(struct file_system_type *fs_type,
903 int flags, const char *dev_name, void *data,
904 struct vfsmount *mnt)
905{
906 struct super_block *sb;
907 struct ceph_client *client;
908 int err;
909 int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
910 const char *path = NULL;
911 struct ceph_mount_args *args;
912
913 dout("ceph_get_sb\n");
914 args = parse_mount_args(flags, data, dev_name, &path);
915 if (IS_ERR(args)) {
916 err = PTR_ERR(args);
917 goto out_final;
918 }
919
920 /* create client (which we may/may not use) */
921 client = ceph_create_client(args);
922 if (IS_ERR(client)) {
923 err = PTR_ERR(client);
924 goto out_final;
925 }
926
927 if (client->mount_args->flags & CEPH_OPT_NOSHARE)
928 compare_super = NULL;
929 sb = sget(fs_type, compare_super, ceph_set_super, client);
930 if (IS_ERR(sb)) {
931 err = PTR_ERR(sb);
932 goto out;
933 }
934
935 if (ceph_client(sb) != client) {
936 ceph_destroy_client(client);
937 client = ceph_client(sb);
938 dout("get_sb got existing client %p\n", client);
939 } else {
940 dout("get_sb using new client %p\n", client);
941 err = ceph_register_bdi(sb, client);
942 if (err < 0)
943 goto out_splat;
944 }
945
946 err = ceph_mount(client, mnt, path);
947 if (err < 0)
948 goto out_splat;
949 dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
950 mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode));
951 return 0;
952
953out_splat:
954 ceph_mdsc_close_sessions(&client->mdsc);
955 up_write(&sb->s_umount);
956 deactivate_super(sb);
957 goto out_final;
958
959out:
960 ceph_destroy_client(client);
961out_final:
962 dout("ceph_get_sb fail %d\n", err);
963 return err;
964}
965
966static void ceph_kill_sb(struct super_block *s)
967{
968 struct ceph_client *client = ceph_sb_to_client(s);
969 dout("kill_sb %p\n", s);
970 ceph_mdsc_pre_umount(&client->mdsc);
971 kill_anon_super(s); /* will call put_super after sb is r/o */
972 ceph_destroy_client(client);
973}
974
975static struct file_system_type ceph_fs_type = {
976 .owner = THIS_MODULE,
977 .name = "ceph",
978 .get_sb = ceph_get_sb,
979 .kill_sb = ceph_kill_sb,
980 .fs_flags = FS_RENAME_DOES_D_MOVE,
981};
982
983#define _STRINGIFY(x) #x
984#define STRINGIFY(x) _STRINGIFY(x)
985
986static int __init init_ceph(void)
987{
988 int ret = 0;
989
990 ret = ceph_debugfs_init();
991 if (ret < 0)
992 goto out;
993
994 ret = ceph_msgr_init();
995 if (ret < 0)
996 goto out_debugfs;
997
998 ret = init_caches();
999 if (ret)
1000 goto out_msgr;
1001
1002 ceph_caps_init();
1003
1004 ret = register_filesystem(&ceph_fs_type);
1005 if (ret)
1006 goto out_icache;
1007
1008 pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n",
1009 CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL,
1010 CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
1011 CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
1012 return 0;
1013
1014out_icache:
1015 destroy_caches();
1016out_msgr:
1017 ceph_msgr_exit();
1018out_debugfs:
1019 ceph_debugfs_cleanup();
1020out:
1021 return ret;
1022}
1023
1024static void __exit exit_ceph(void)
1025{
1026 dout("exit_ceph\n");
1027 unregister_filesystem(&ceph_fs_type);
1028 ceph_caps_finalize();
1029 destroy_caches();
1030 ceph_msgr_exit();
1031 ceph_debugfs_cleanup();
1032}
1033
1034module_init(init_ceph);
1035module_exit(exit_ceph);
1036
1037MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
1038MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
1039MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
1040MODULE_DESCRIPTION("Ceph filesystem for Linux");
1041MODULE_LICENSE("GPL");
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
new file mode 100644
index 000000000000..13513b80d87f
--- /dev/null
+++ b/fs/ceph/super.h
@@ -0,0 +1,902 @@
1#ifndef _FS_CEPH_SUPER_H
2#define _FS_CEPH_SUPER_H
3
4#include "ceph_debug.h"
5
6#include <asm/unaligned.h>
7#include <linux/backing-dev.h>
8#include <linux/completion.h>
9#include <linux/exportfs.h>
10#include <linux/fs.h>
11#include <linux/mempool.h>
12#include <linux/pagemap.h>
13#include <linux/slab.h>
14#include <linux/wait.h>
15#include <linux/writeback.h>
17
18#include "types.h"
19#include "messenger.h"
20#include "msgpool.h"
21#include "mon_client.h"
22#include "mds_client.h"
23#include "osd_client.h"
24#include "ceph_fs.h"
25
26/* f_type in struct statfs */
27#define CEPH_SUPER_MAGIC 0x00c36400
28
29/* large granularity for statfs utilization stats to facilitate
30 * large volume sizes on 32-bit machines. */
31#define CEPH_BLOCK_SHIFT 20 /* 1 MB */
32#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
33
34/*
35 * mount options
36 */
37#define CEPH_OPT_FSID (1<<0)
38#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
39#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
40#define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */
41#define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
42#define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */
43#define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
44
45#define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES)
46
47#define ceph_set_opt(client, opt) \
48 (client)->mount_args->flags |= CEPH_OPT_##opt;
49#define ceph_test_opt(client, opt) \
50 (!!((client)->mount_args->flags & CEPH_OPT_##opt))
51
52
53struct ceph_mount_args {
54 int sb_flags;
55 int num_mon;
56 struct ceph_entity_addr *mon_addr;
57 int flags;
58 int mount_timeout;
59 int osd_idle_ttl;
60 int caps_wanted_delay_min, caps_wanted_delay_max;
61 struct ceph_fsid fsid;
62 struct ceph_entity_addr my_addr;
63 int wsize;
64 int rsize; /* max readahead */
65 int max_readdir; /* max readdir size */
66	int congestion_kb;      /* max writeback in flight (kb) */
67 int osd_timeout;
68 int osd_keepalive_timeout;
69 char *snapdir_name; /* default ".snap" */
70 char *name;
71 char *secret;
72 int cap_release_safety;
73};
74
75/*
76 * defaults
77 */
78#define CEPH_MOUNT_TIMEOUT_DEFAULT 60
79#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */
80#define CEPH_OSD_KEEPALIVE_DEFAULT 5
81#define CEPH_OSD_IDLE_TTL_DEFAULT 60
82#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
83
84#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
85#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
86
87#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
88#define CEPH_AUTH_NAME_DEFAULT "guest"
89
90/*
91 * Delay telling the MDS we no longer want caps, in case we reopen
92 * the file. Delay a minimum amount of time, even if we send a cap
93 * message for some other reason.  Otherwise, take the opportunity to
94 * update the mds to avoid sending another message later.
95 */
96#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
97#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
98
99
100/* mount state */
101enum {
102 CEPH_MOUNT_MOUNTING,
103 CEPH_MOUNT_MOUNTED,
104 CEPH_MOUNT_UNMOUNTING,
105 CEPH_MOUNT_UNMOUNTED,
106 CEPH_MOUNT_SHUTDOWN,
107};
108
109/*
110 * subtract jiffies
111 */
112static inline unsigned long time_sub(unsigned long a, unsigned long b)
113{
114 BUG_ON(time_after(b, a));
115 return (long)a - (long)b;
116}
117
118/*
119 * per-filesystem client state
120 *
121 * possibly shared by multiple mount points, if they are
122 * mounting the same ceph filesystem/cluster.
123 */
124struct ceph_client {
125 struct ceph_fsid fsid;
126 bool have_fsid;
127
128 struct mutex mount_mutex; /* serialize mount attempts */
129 struct ceph_mount_args *mount_args;
130
131 struct super_block *sb;
132
133 unsigned long mount_state;
134 wait_queue_head_t auth_wq;
135
136 int auth_err;
137
138 int min_caps; /* min caps i added */
139
140 struct ceph_messenger *msgr; /* messenger instance */
141 struct ceph_mon_client monc;
142 struct ceph_mds_client mdsc;
143 struct ceph_osd_client osdc;
144
145 /* writeback */
146 mempool_t *wb_pagevec_pool;
147 struct workqueue_struct *wb_wq;
148 struct workqueue_struct *pg_inv_wq;
149 struct workqueue_struct *trunc_wq;
150 atomic_long_t writeback_count;
151
152 struct backing_dev_info backing_dev_info;
153
154#ifdef CONFIG_DEBUG_FS
155 struct dentry *debugfs_monmap;
156 struct dentry *debugfs_mdsmap, *debugfs_osdmap;
157 struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
158 struct dentry *debugfs_congestion_kb;
159 struct dentry *debugfs_bdi;
160#endif
161};
162
163static inline struct ceph_client *ceph_client(struct super_block *sb)
164{
165 return sb->s_fs_info;
166}
167
168
169/*
170 * File i/o capability. This tracks shared state with the metadata
171 * server that allows us to cache or writeback attributes or to read
172 * and write data. For any given inode, we should have one or more
173 * capabilities, one issued by each metadata server, and our
174 * cumulative access is the OR of all issued capabilities.
175 *
176 * Each cap is referenced by the inode's i_caps rbtree and by per-mds
177 * session capability lists.
178 */
179struct ceph_cap {
180 struct ceph_inode_info *ci;
181 struct rb_node ci_node; /* per-ci cap tree */
182 struct ceph_mds_session *session;
183 struct list_head session_caps; /* per-session caplist */
184 int mds;
185 u64 cap_id; /* unique cap id (mds provided) */
186 int issued; /* latest, from the mds */
187 int implemented; /* implemented superset of issued (for revocation) */
188 int mds_wanted;
189 u32 seq, issue_seq, mseq;
190 u32 cap_gen; /* active/stale cycle */
191 unsigned long last_used;
192 struct list_head caps_item;
193};
194
195#define CHECK_CAPS_NODELAY 1 /* do not delay any further */
196#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
197#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
198
199/*
200 * Snapped cap state that is pending flush to mds. When a snapshot occurs,
201 * we first complete any in-process sync writes and writeback any dirty
202 * data before flushing the snapped state (tracked here) back to the MDS.
203 */
204struct ceph_cap_snap {
205 atomic_t nref;
206 struct ceph_inode_info *ci;
207 struct list_head ci_item, flushing_item;
208
209 u64 follows, flush_tid;
210 int issued, dirty;
211 struct ceph_snap_context *context;
212
213 mode_t mode;
214 uid_t uid;
215 gid_t gid;
216
217 void *xattr_blob;
218 int xattr_len;
219 u64 xattr_version;
220
221 u64 size;
222 struct timespec mtime, atime, ctime;
223 u64 time_warp_seq;
224 int writing; /* a sync write is still in progress */
225 int dirty_pages; /* dirty pages awaiting writeback */
226};
227
228static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
229{
230 if (atomic_dec_and_test(&capsnap->nref))
231 kfree(capsnap);
232}
233
234/*
235 * The frag tree describes how a directory is fragmented, potentially across
236 * multiple metadata servers. It is also used to indicate points where
237 * metadata authority is delegated, and whether/where metadata is replicated.
238 *
239 * A _leaf_ frag will be present in the i_fragtree IFF there is
240 * delegation info. That is, if mds >= 0 || ndist > 0.
241 */
242#define CEPH_MAX_DIRFRAG_REP 4
243
244struct ceph_inode_frag {
245 struct rb_node node;
246
247 /* fragtree state */
248 u32 frag;
249 int split_by; /* i.e. 2^(split_by) children */
250
251 /* delegation and replication info */
252 int mds; /* -1 if same authority as parent */
253 int ndist; /* >0 if replicated */
254 int dist[CEPH_MAX_DIRFRAG_REP];
255};
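/*
 * Illustrative example (not in the original source): a frag with
 * split_by = 2 has 2^2 = 4 children, while a leaf with mds = 3 and
 * ndist = 0 simply delegates that slice of the directory to mds3
 * instead of the parent's authority.
 */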
256
257/*
258 * We cache inode xattrs as an encoded blob until they are first used,
259 * at which point we parse them into an rbtree.
260 */
261struct ceph_inode_xattr {
262 struct rb_node node;
263
264 const char *name;
265 int name_len;
266 const char *val;
267 int val_len;
268 int dirty;
269
270 int should_free_name;
271 int should_free_val;
272};
273
274struct ceph_inode_xattrs_info {
275 /*
276 * (still encoded) xattr blob. we avoid the overhead of parsing
277 * this until someone actually calls getxattr, etc.
278 *
279 * blob->vec.iov_len == 4 implies there are no xattrs; blob ==
280 * NULL means we don't know.
281 */
282 struct ceph_buffer *blob, *prealloc_blob;
283
284 struct rb_root index;
285 bool dirty;
286 int count;
287 int names_size;
288 int vals_size;
289 u64 version, index_version;
290};
291
292/*
293 * Ceph inode.
294 */
295#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
296#define CEPH_I_NODELAY 4 /* do not delay cap release */
297#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
298#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
299
300struct ceph_inode_info {
301 struct ceph_vino i_vino; /* ceph ino + snap */
302
303 u64 i_version;
304 u32 i_time_warp_seq;
305
306 unsigned i_ceph_flags;
307 unsigned long i_release_count;
308
309 struct ceph_file_layout i_layout;
310 char *i_symlink;
311
312 /* for dirs */
313 struct timespec i_rctime;
314 u64 i_rbytes, i_rfiles, i_rsubdirs;
315 u64 i_files, i_subdirs;
316 u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */
317
318 struct rb_root i_fragtree;
319 struct mutex i_fragtree_mutex;
320
321 struct ceph_inode_xattrs_info i_xattrs;
322
323 /* capabilities. protected _both_ by i_lock and cap->session's
324 * s_mutex. */
325 struct rb_root i_caps; /* cap list */
326 struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
327 unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
328 struct list_head i_dirty_item, i_flushing_item;
329 u64 i_cap_flush_seq;
330 /* we need to track cap writeback on a per-cap-bit basis, to allow
331 * overlapping, pipelined cap flushes to the mds. we can probably
332 * reduce the tid to 8 bits if we're concerned about inode size. */
333 u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
334 wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
335 unsigned long i_hold_caps_min; /* jiffies */
336 unsigned long i_hold_caps_max; /* jiffies */
337 struct list_head i_cap_delay_list; /* for delayed cap release to mds */
338 int i_cap_exporting_mds; /* to handle cap migration between */
339 unsigned i_cap_exporting_mseq; /* mds's. */
340 unsigned i_cap_exporting_issued;
341 struct ceph_cap_reservation i_cap_migration_resv;
342 struct list_head i_cap_snaps; /* snapped state pending flush to mds */
343 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 */
344 unsigned i_snap_caps; /* cap bits for snapped files */
345
346 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
347
348 u32 i_truncate_seq; /* last truncate to smaller size */
349 u64 i_truncate_size; /* and the size we last truncated down to */
350 int i_truncate_pending; /* still need to call vmtruncate */
351
352 u64 i_max_size; /* max file size authorized by mds */
353 u64 i_reported_size; /* (max_)size reported to or requested of mds */
354	u64 i_wanted_max_size;       /* offset we'd like to write to */
355 u64 i_requested_max_size; /* max_size we've requested */
356
357 /* held references to caps */
358 int i_pin_ref;
359 int i_rd_ref, i_rdcache_ref, i_wr_ref;
360 int i_wrbuffer_ref, i_wrbuffer_ref_head;
361 u32 i_shared_gen; /* increment each time we get FILE_SHARED */
362 u32 i_rdcache_gen; /* we increment this each time we get
363 FILE_CACHE. If it's non-zero, we
364 _may_ have cached pages. */
365 u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
366
367 struct list_head i_unsafe_writes; /* uncommitted sync writes */
368 struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
369 spinlock_t i_unsafe_lock;
370
371 struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
372 int i_snap_realm_counter; /* snap realm (if caps) */
373 struct list_head i_snap_realm_item;
374 struct list_head i_snap_flush_item;
375
376 struct work_struct i_wb_work; /* writeback work */
377 struct work_struct i_pg_inv_work; /* page invalidation work */
378
379 struct work_struct i_vmtruncate_work;
380
381 struct inode vfs_inode; /* at end */
382};
383
384static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
385{
386 return container_of(inode, struct ceph_inode_info, vfs_inode);
387}
388
389static inline void ceph_i_clear(struct inode *inode, unsigned mask)
390{
391 struct ceph_inode_info *ci = ceph_inode(inode);
392
393 spin_lock(&inode->i_lock);
394 ci->i_ceph_flags &= ~mask;
395 spin_unlock(&inode->i_lock);
396}
397
398static inline void ceph_i_set(struct inode *inode, unsigned mask)
399{
400 struct ceph_inode_info *ci = ceph_inode(inode);
401
402 spin_lock(&inode->i_lock);
403 ci->i_ceph_flags |= mask;
404 spin_unlock(&inode->i_lock);
405}
406
407static inline bool ceph_i_test(struct inode *inode, unsigned mask)
408{
409 struct ceph_inode_info *ci = ceph_inode(inode);
410 bool r;
411
412 smp_mb();
413 r = (ci->i_ceph_flags & mask) == mask;
414 return r;
415}
416
417
418/* find a specific frag @f */
419extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci,
420 u32 f);
421
422/*
423 * choose fragment for value @v. copy frag content to pfrag, if leaf
424 * exists
425 */
426extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
427 struct ceph_inode_frag *pfrag,
428 int *found);
429
430/*
431 * Ceph dentry state
432 */
433struct ceph_dentry_info {
434 struct ceph_mds_session *lease_session;
435 u32 lease_gen, lease_shared_gen;
436 u32 lease_seq;
437 unsigned long lease_renew_after, lease_renew_from;
438 struct list_head lru;
439 struct dentry *dentry;
440 u64 time;
441 u64 offset;
442};
443
444static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
445{
446 return (struct ceph_dentry_info *)dentry->d_fsdata;
447}
448
449static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
450{
451 return ((loff_t)frag << 32) | (loff_t)off;
452}
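/*
 * Worked example (illustrative): ceph_make_fpos(0x2, 0x10) yields
 * ((loff_t)0x2 << 32) | 0x10 == 0x0000000200000010, packing the
 * readdir fragment and the offset within it into a single f_pos.
 */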
453
454/*
455 * ino_t is <64 bits on many architectures, blech.
456 *
457 * don't include snap in ino hash, at least for now.
458 */
459static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
460{
461 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
462#if BITS_PER_LONG == 32
463 ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
464 if (!ino)
465 ino = 1;
466#endif
467 return ino;
468}
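/*
 * Worked example (illustrative, 32-bit host with 32-bit ino_t):
 * vino.ino = 0x123456789 truncates to 0x23456789 and is then xor-folded
 * with (0x123456789 >> 32) == 0x1, giving ino = 0x23456788; uniqueness
 * is traded away so the value fits in ino_t.
 */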
469
470static inline int ceph_set_ino_cb(struct inode *inode, void *data)
471{
472 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
473 inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
474 return 0;
475}
476
477static inline struct ceph_vino ceph_vino(struct inode *inode)
478{
479 return ceph_inode(inode)->i_vino;
480}
481
482/* for printf-style formatting */
483#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
484
485static inline u64 ceph_ino(struct inode *inode)
486{
487 return ceph_inode(inode)->i_vino.ino;
488}
489static inline u64 ceph_snap(struct inode *inode)
490{
491 return ceph_inode(inode)->i_vino.snap;
492}
493
494static inline int ceph_ino_compare(struct inode *inode, void *data)
495{
496 struct ceph_vino *pvino = (struct ceph_vino *)data;
497 struct ceph_inode_info *ci = ceph_inode(inode);
498 return ci->i_vino.ino == pvino->ino &&
499 ci->i_vino.snap == pvino->snap;
500}
501
502static inline struct inode *ceph_find_inode(struct super_block *sb,
503 struct ceph_vino vino)
504{
505 ino_t t = ceph_vino_to_ino(vino);
506 return ilookup5(sb, t, ceph_ino_compare, &vino);
507}
508
509
510/*
511 * caps helpers
512 */
513static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
514{
515 return !RB_EMPTY_ROOT(&ci->i_caps);
516}
517
518extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented);
519extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t);
520extern int __ceph_caps_issued_other(struct ceph_inode_info *ci,
521 struct ceph_cap *cap);
522
523static inline int ceph_caps_issued(struct ceph_inode_info *ci)
524{
525 int issued;
526 spin_lock(&ci->vfs_inode.i_lock);
527 issued = __ceph_caps_issued(ci, NULL);
528 spin_unlock(&ci->vfs_inode.i_lock);
529 return issued;
530}
531
532static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask,
533 int touch)
534{
535 int r;
536 spin_lock(&ci->vfs_inode.i_lock);
537 r = __ceph_caps_issued_mask(ci, mask, touch);
538 spin_unlock(&ci->vfs_inode.i_lock);
539 return r;
540}
541
542static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
543{
544 return ci->i_dirty_caps | ci->i_flushing_caps;
545}
546extern void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
547
548extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
549extern int __ceph_caps_used(struct ceph_inode_info *ci);
550
551extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
552
553/*
554 * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
555 */
556static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
557{
558 int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
559 if (w & CEPH_CAP_FILE_BUFFER)
560 w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */
561 return w;
562}
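/*
 * Example (illustrative): a file open for write with dirty buffered
 * pages contributes CEPH_CAP_FILE_WR via its open mode and
 * CEPH_CAP_FILE_BUFFER via cap refs, so the helper above also ORs in
 * CEPH_CAP_FILE_EXCL until the dirty data is written back.
 */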
563
564/* what the mds thinks we want */
565extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
566
567extern void ceph_caps_init(void);
568extern void ceph_caps_finalize(void);
569extern void ceph_adjust_min_caps(int delta);
570extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need);
571extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx);
572extern void ceph_reservation_status(struct ceph_client *client,
573 int *total, int *avail, int *used,
574 int *reserved, int *min);
575
576static inline struct ceph_client *ceph_inode_to_client(struct inode *inode)
577{
578 return (struct ceph_client *)inode->i_sb->s_fs_info;
579}
580
581static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb)
582{
583 return (struct ceph_client *)sb->s_fs_info;
584}
585
586
587/*
588 * we keep buffered readdir results attached to file->private_data
589 */
590struct ceph_file_info {
591 int fmode; /* initialized on open */
592
593 /* readdir: position within the dir */
594 u32 frag;
595 struct ceph_mds_request *last_readdir;
596 int at_end;
597
598 /* readdir: position within a frag */
599 unsigned offset; /* offset of last chunk, adjusted for . and .. */
600 u64 next_offset; /* offset of next chunk (last_name's + 1) */
601 char *last_name; /* last entry in previous chunk */
602 struct dentry *dentry; /* next dentry (for dcache readdir) */
603 unsigned long dir_release_count;
604
605 /* used for -o dirstat read() on directory thing */
606 char *dir_info;
607 int dir_info_len;
608};
609
610
611
612/*
613 * snapshots
614 */
615
616/*
617 * A "snap context" is the set of existing snapshots when we
618 * write data. It is used by the OSD to guide its COW behavior.
619 *
620 * The ceph_snap_context is refcounted, and attached to each dirty
621 * page, indicating which context the dirty data belonged to when it was
622 * dirtied.
623 */
624struct ceph_snap_context {
625 atomic_t nref;
626 u64 seq;
627 int num_snaps;
628 u64 snaps[];
629};
630
631static inline struct ceph_snap_context *
632ceph_get_snap_context(struct ceph_snap_context *sc)
633{
634 /*
635 printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
636 atomic_read(&sc->nref)+1);
637 */
638 if (sc)
639 atomic_inc(&sc->nref);
640 return sc;
641}
642
643static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
644{
645 if (!sc)
646 return;
647 /*
648 printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
649 atomic_read(&sc->nref)-1);
650 */
651 if (atomic_dec_and_test(&sc->nref)) {
652 /*printk(" deleting snap_context %p\n", sc);*/
653 kfree(sc);
654 }
655}
656
657/*
658 * A "snap realm" describes a subset of the file hierarchy sharing
659 * the same set of snapshots that apply to it. The realms themselves
660 * are organized into a hierarchy, such that children inherit (some of)
661 * the snapshots of their parents.
662 *
663 * All inodes within the realm that have capabilities are linked into a
664 * per-realm list.
665 */
666struct ceph_snap_realm {
667 u64 ino;
668 atomic_t nref;
669 struct rb_node node;
670
671 u64 created, seq;
672 u64 parent_ino;
673 u64 parent_since; /* snapid when our current parent became so */
674
675 u64 *prior_parent_snaps; /* snaps inherited from any parents we */
676 int num_prior_parent_snaps; /* had prior to parent_since */
677 u64 *snaps; /* snaps specific to this realm */
678 int num_snaps;
679
680 struct ceph_snap_realm *parent;
681 struct list_head children; /* list of child realms */
682 struct list_head child_item;
683
684 struct list_head empty_item; /* if i have ref==0 */
685
686 /* the current set of snaps for this realm */
687 struct ceph_snap_context *cached_context;
688
689 struct list_head inodes_with_caps;
690 spinlock_t inodes_with_caps_lock;
691};
692
693
694
695/*
696 * calculate the number of pages a given length and offset map onto,
697 * if we align the data.
698 */
699static inline int calc_pages_for(u64 off, u64 len)
700{
701 return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
702 (off >> PAGE_CACHE_SHIFT);
703}
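/*
 * Worked example (illustrative, 4 KB pages): calc_pages_for(4095, 2)
 * = ((4095 + 2 + 4095) >> 12) - (4095 >> 12) = 2 - 0 = 2 pages, since
 * a two-byte span starting at offset 4095 crosses one page boundary.
 */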
704
705
706
707/* snap.c */
708struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
709 u64 ino);
710extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
711 struct ceph_snap_realm *realm);
712extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
713 struct ceph_snap_realm *realm);
714extern int ceph_update_snap_trace(struct ceph_mds_client *m,
715 void *p, void *e, bool deletion);
716extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
717 struct ceph_mds_session *session,
718 struct ceph_msg *msg);
719extern void ceph_queue_cap_snap(struct ceph_inode_info *ci);
720extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
721 struct ceph_cap_snap *capsnap);
722extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
723
724/*
725 * a cap_snap is "pending" if it is still awaiting an in-progress
726 * sync write (that may/may not still update size, mtime, etc.).
727 */
728static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
729{
730 return !list_empty(&ci->i_cap_snaps) &&
731 list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap,
732 ci_item)->writing;
733}
734
735
736/* super.c */
737extern struct kmem_cache *ceph_inode_cachep;
738extern struct kmem_cache *ceph_cap_cachep;
739extern struct kmem_cache *ceph_dentry_cachep;
740extern struct kmem_cache *ceph_file_cachep;
741
742extern const char *ceph_msg_type_name(int type);
743extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
744
745#define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \
746 "%02x%02x%02x%02x%02x%02x"
747#define PR_FSID(f) (f)->fsid[0], (f)->fsid[1], (f)->fsid[2], (f)->fsid[3], \
748 (f)->fsid[4], (f)->fsid[5], (f)->fsid[6], (f)->fsid[7], \
749 (f)->fsid[8], (f)->fsid[9], (f)->fsid[10], (f)->fsid[11], \
750 (f)->fsid[12], (f)->fsid[13], (f)->fsid[14], (f)->fsid[15]
751
752/* inode.c */
753extern const struct inode_operations ceph_file_iops;
754
755extern struct inode *ceph_alloc_inode(struct super_block *sb);
756extern void ceph_destroy_inode(struct inode *inode);
757
758extern struct inode *ceph_get_inode(struct super_block *sb,
759 struct ceph_vino vino);
760extern struct inode *ceph_get_snapdir(struct inode *parent);
761extern int ceph_fill_file_size(struct inode *inode, int issued,
762 u32 truncate_seq, u64 truncate_size, u64 size);
763extern void ceph_fill_file_time(struct inode *inode, int issued,
764 u64 time_warp_seq, struct timespec *ctime,
765 struct timespec *mtime, struct timespec *atime);
766extern int ceph_fill_trace(struct super_block *sb,
767 struct ceph_mds_request *req,
768 struct ceph_mds_session *session);
769extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
770 struct ceph_mds_session *session);
771
772extern int ceph_inode_holds_cap(struct inode *inode, int mask);
773
774extern int ceph_inode_set_size(struct inode *inode, loff_t size);
775extern void __ceph_do_pending_vmtruncate(struct inode *inode);
776extern void ceph_queue_vmtruncate(struct inode *inode);
777
778extern void ceph_queue_invalidate(struct inode *inode);
779extern void ceph_queue_writeback(struct inode *inode);
780
781extern int ceph_do_getattr(struct inode *inode, int mask);
782extern int ceph_permission(struct inode *inode, int mask);
783extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
784extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
785 struct kstat *stat);
786
787/* xattr.c */
788extern int ceph_setxattr(struct dentry *, const char *, const void *,
789 size_t, int);
790extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
791extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
792extern int ceph_removexattr(struct dentry *, const char *);
793extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
794extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
795
796/* caps.c */
797extern const char *ceph_cap_string(int c);
798extern void ceph_handle_caps(struct ceph_mds_session *session,
799 struct ceph_msg *msg);
800extern int ceph_add_cap(struct inode *inode,
801 struct ceph_mds_session *session, u64 cap_id,
802 int fmode, unsigned issued, unsigned wanted,
803 unsigned cap, unsigned seq, u64 realmino, int flags,
804 struct ceph_cap_reservation *caps_reservation);
805extern void __ceph_remove_cap(struct ceph_cap *cap);
806static inline void ceph_remove_cap(struct ceph_cap *cap)
807{
808 struct inode *inode = &cap->ci->vfs_inode;
809 spin_lock(&inode->i_lock);
810 __ceph_remove_cap(cap);
811 spin_unlock(&inode->i_lock);
812}
813extern void ceph_put_cap(struct ceph_cap *cap);
814
815extern void ceph_queue_caps_release(struct inode *inode);
816extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
817extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync);
818extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
819 struct ceph_mds_session *session);
820extern int ceph_get_cap_mds(struct inode *inode);
821extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
822extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
823extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
824 struct ceph_snap_context *snapc);
825extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
826 struct ceph_mds_session **psession);
827extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
828 struct ceph_mds_session *session);
829extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
830extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
831
832extern int ceph_encode_inode_release(void **p, struct inode *inode,
833 int mds, int drop, int unless, int force);
834extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
835 int mds, int drop, int unless);
836
837extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
838 int *got, loff_t endoff);
839
840/* for counting open files by mode */
841static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode)
842{
843 ci->i_nr_by_mode[mode]++;
844}
845extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
846
847/* addr.c */
848extern const struct address_space_operations ceph_aops;
849extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
850
851/* file.c */
852extern const struct file_operations ceph_file_fops;
853extern const struct address_space_operations ceph_aops;
854extern int ceph_open(struct inode *inode, struct file *file);
855extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
856 struct nameidata *nd, int mode,
857 int locked_dir);
858extern int ceph_release(struct inode *inode, struct file *filp);
859extern void ceph_release_page_vector(struct page **pages, int num_pages);
860
861/* dir.c */
862extern const struct file_operations ceph_dir_fops;
863extern const struct inode_operations ceph_dir_iops;
864extern struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
865 ceph_snapdir_dentry_ops;
866
867extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
868extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
869 struct dentry *dentry, int err);
870
871extern void ceph_dentry_lru_add(struct dentry *dn);
872extern void ceph_dentry_lru_touch(struct dentry *dn);
873extern void ceph_dentry_lru_del(struct dentry *dn);
874
875/*
876 * our d_ops vary depending on whether the inode is live,
877 * snapshotted (read-only), or a virtual ".snap" directory.
878 */
879int ceph_init_dentry(struct dentry *dentry);
880
881
882/* ioctl.c */
883extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
884
885/* export.c */
886extern const struct export_operations ceph_export_ops;
887
888/* debugfs.c */
889extern int ceph_debugfs_init(void);
890extern void ceph_debugfs_cleanup(void);
891extern int ceph_debugfs_client_init(struct ceph_client *client);
892extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
893
894static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
895{
896 if (dentry && dentry->d_parent)
897 return dentry->d_parent->d_inode;
898
899 return NULL;
900}
901
902#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/types.h b/fs/ceph/types.h
new file mode 100644
index 000000000000..28b35a005ec2
--- /dev/null
+++ b/fs/ceph/types.h
@@ -0,0 +1,29 @@
1#ifndef _FS_CEPH_TYPES_H
2#define _FS_CEPH_TYPES_H
3
4/* needed before including ceph_fs.h */
5#include <linux/in.h>
6#include <linux/types.h>
7#include <linux/fcntl.h>
8#include <linux/string.h>
9
10#include "ceph_fs.h"
11#include "ceph_frag.h"
12#include "ceph_hash.h"
13
14/*
15 * Identify inodes by both their ino AND snapshot id (a u64).
16 */
17struct ceph_vino {
18 u64 ino;
19 u64 snap;
20};
21
22
23/* context for the caps reservation mechanism */
24struct ceph_cap_reservation {
25 int count;
26};
27
28
29#endif
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
new file mode 100644
index 000000000000..2845422907fc
--- /dev/null
+++ b/fs/ceph/xattr.c
@@ -0,0 +1,845 @@
1#include "ceph_debug.h"
2#include "super.h"
3#include "decode.h"
4
5#include <linux/xattr.h>
6#include <linux/slab.h>
7
8static bool ceph_is_valid_xattr(const char *name)
9{
10 return !strncmp(name, XATTR_SECURITY_PREFIX,
11 XATTR_SECURITY_PREFIX_LEN) ||
12 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
13 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
14}
15
16/*
17 * These define virtual xattrs exposing the recursive directory
18 * statistics and layout metadata.
19 */
20struct ceph_vxattr_cb {
21 bool readonly;
22 char *name;
23 size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
24 size_t size);
25};
26
27/* directories */
28
29static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val,
30 size_t size)
31{
32 return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
33}
34
35static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val,
36 size_t size)
37{
38 return snprintf(val, size, "%lld", ci->i_files);
39}
40
41static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val,
42 size_t size)
43{
44 return snprintf(val, size, "%lld", ci->i_subdirs);
45}
46
47static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val,
48 size_t size)
49{
50 return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
51}
52
53static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val,
54 size_t size)
55{
56 return snprintf(val, size, "%lld", ci->i_rfiles);
57}
58
59static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val,
60 size_t size)
61{
62 return snprintf(val, size, "%lld", ci->i_rsubdirs);
63}
64
65static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val,
66 size_t size)
67{
68 return snprintf(val, size, "%lld", ci->i_rbytes);
69}
70
71static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
72 size_t size)
73{
74 return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec,
75 (long)ci->i_rctime.tv_nsec);
76}
77
78static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
79 { true, "user.ceph.dir.entries", ceph_vxattrcb_entries},
80 { true, "user.ceph.dir.files", ceph_vxattrcb_files},
81 { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs},
82 { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries},
83 { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles},
84 { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
85 { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes},
86 { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime},
87 { true, NULL, NULL }
88};
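
These entries are read through the ordinary xattr system calls. A hedged userspace sketch (the mount point /mnt/ceph/somedir is hypothetical) might look like:

#include <stdio.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(void)
{
        char buf[64];
        /* ask the kernel for the recursive byte count of a directory tree */
        ssize_t n = getxattr("/mnt/ceph/somedir", "user.ceph.dir.rbytes",
                             buf, sizeof(buf) - 1);

        if (n < 0) {
                perror("getxattr");
                return 1;
        }
        buf[n] = '\0';
        printf("rbytes = %s\n", buf);
        return 0;
}
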
89
90/* files */
91
92static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
93 size_t size)
94{
95 int ret;
96
97 ret = snprintf(val, size,
98 "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n",
99 (unsigned long long)ceph_file_layout_su(ci->i_layout),
100 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
101 (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
102 if (ceph_file_layout_pg_preferred(ci->i_layout))
103 ret += snprintf(val + ret, size - ret, "preferred_osd=%lld\n",
104 (unsigned long long)ceph_file_layout_pg_preferred(
105 ci->i_layout));
106 return ret;
107}
108
109static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
110 { true, "user.ceph.layout", ceph_vxattrcb_layout},
111 { true, NULL, NULL }
112};
113
114static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
115{
116 if (S_ISDIR(inode->i_mode))
117 return ceph_dir_vxattrs;
118 else if (S_ISREG(inode->i_mode))
119 return ceph_file_vxattrs;
120 return NULL;
121}
122
123static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr,
124 const char *name)
125{
126 do {
127 if (strcmp(vxattr->name, name) == 0)
128 return vxattr;
129 vxattr++;
130 } while (vxattr->name);
131 return NULL;
132}
133
134static int __set_xattr(struct ceph_inode_info *ci,
135 const char *name, int name_len,
136 const char *val, int val_len,
137 int dirty,
138 int should_free_name, int should_free_val,
139 struct ceph_inode_xattr **newxattr)
140{
141 struct rb_node **p;
142 struct rb_node *parent = NULL;
143 struct ceph_inode_xattr *xattr = NULL;
144 int c;
145 int new = 0;
146
147 p = &ci->i_xattrs.index.rb_node;
148 while (*p) {
149 parent = *p;
150 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
151 c = strncmp(name, xattr->name, min(name_len, xattr->name_len));
152 if (c < 0)
153 p = &(*p)->rb_left;
154 else if (c > 0)
155 p = &(*p)->rb_right;
156 else {
157 if (name_len == xattr->name_len)
158 break;
159 else if (name_len < xattr->name_len)
160 p = &(*p)->rb_left;
161 else
162 p = &(*p)->rb_right;
163 }
164 xattr = NULL;
165 }
166
167 if (!xattr) {
168 new = 1;
169 xattr = *newxattr;
170 xattr->name = name;
171 xattr->name_len = name_len;
172 xattr->should_free_name = should_free_name;
173
174 ci->i_xattrs.count++;
175 dout("__set_xattr count=%d\n", ci->i_xattrs.count);
176 } else {
177 kfree(*newxattr);
178 *newxattr = NULL;
179 if (xattr->should_free_val)
180 kfree((void *)xattr->val);
181
182 if (should_free_name) {
183 kfree((void *)name);
184 name = xattr->name;
185 }
186 ci->i_xattrs.names_size -= xattr->name_len;
187 ci->i_xattrs.vals_size -= xattr->val_len;
188 }
189 if (!xattr) {
190 pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s\n",
191 &ci->vfs_inode, ceph_vinop(&ci->vfs_inode),
192 name);
193 return -ENOMEM;
194 }
195 ci->i_xattrs.names_size += name_len;
196 ci->i_xattrs.vals_size += val_len;
197 if (val)
198 xattr->val = val;
199 else
200 xattr->val = "";
201
202 xattr->val_len = val_len;
203 xattr->dirty = dirty;
204 xattr->should_free_val = (val && should_free_val);
205
206 if (new) {
207 rb_link_node(&xattr->node, parent, p);
208 rb_insert_color(&xattr->node, &ci->i_xattrs.index);
209 dout("__set_xattr_val p=%p\n", p);
210 }
211
212 dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n",
213 ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val);
214
215 return 0;
216}
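
The tree above keys entries by name, comparing the shared prefix with strncmp() and breaking ties by length, so a shorter name sorts before any longer name it is a prefix of. A standalone sketch of that ordering (demo_xattr_cmp is not in the source) behaves like this:

#include <stdio.h>
#include <string.h>

/* < 0, 0, > 0 -- the same ordering __set_xattr() uses for the rb-tree */
static int demo_xattr_cmp(const char *a, int a_len, const char *b, int b_len)
{
        int c = strncmp(a, b, a_len < b_len ? a_len : b_len);

        if (c)
                return c;
        return a_len - b_len;   /* shorter name sorts first on a prefix tie */
}

int main(void)
{
        printf("%d\n", demo_xattr_cmp("user.a", 6, "user.ab", 7) < 0);  /* 1 */
        printf("%d\n", demo_xattr_cmp("user.b", 6, "user.a", 6) > 0);   /* 1 */
        printf("%d\n", demo_xattr_cmp("user.a", 6, "user.a", 6) == 0);  /* 1 */
        return 0;
}
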
217
218static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
219 const char *name)
220{
221 struct rb_node **p;
222 struct rb_node *parent = NULL;
223 struct ceph_inode_xattr *xattr = NULL;
224 int c;
225
226 p = &ci->i_xattrs.index.rb_node;
227 while (*p) {
228 parent = *p;
229 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
230 c = strncmp(name, xattr->name, xattr->name_len);
231 if (c < 0)
232 p = &(*p)->rb_left;
233 else if (c > 0)
234 p = &(*p)->rb_right;
235 else {
236 dout("__get_xattr %s: found %.*s\n", name,
237 xattr->val_len, xattr->val);
238 return xattr;
239 }
240 }
241
242 dout("__get_xattr %s: not found\n", name);
243
244 return NULL;
245}
246
247static void __free_xattr(struct ceph_inode_xattr *xattr)
248{
249 BUG_ON(!xattr);
250
251 if (xattr->should_free_name)
252 kfree((void *)xattr->name);
253 if (xattr->should_free_val)
254 kfree((void *)xattr->val);
255
256 kfree(xattr);
257}
258
259static int __remove_xattr(struct ceph_inode_info *ci,
260 struct ceph_inode_xattr *xattr)
261{
262 if (!xattr)
263 return -EOPNOTSUPP;
264
265 rb_erase(&xattr->node, &ci->i_xattrs.index);
266
267 if (xattr->should_free_name)
268 kfree((void *)xattr->name);
269 if (xattr->should_free_val)
270 kfree((void *)xattr->val);
271
272 ci->i_xattrs.names_size -= xattr->name_len;
273 ci->i_xattrs.vals_size -= xattr->val_len;
274 ci->i_xattrs.count--;
275 kfree(xattr);
276
277 return 0;
278}
279
280static int __remove_xattr_by_name(struct ceph_inode_info *ci,
281 const char *name)
282{
283 struct rb_node **p;
284 struct ceph_inode_xattr *xattr;
285 int err;
286
287 p = &ci->i_xattrs.index.rb_node;
288 xattr = __get_xattr(ci, name);
289 err = __remove_xattr(ci, xattr);
290 return err;
291}
292
293static char *__copy_xattr_names(struct ceph_inode_info *ci,
294 char *dest)
295{
296 struct rb_node *p;
297 struct ceph_inode_xattr *xattr = NULL;
298
299 p = rb_first(&ci->i_xattrs.index);
300 dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count);
301
302 while (p) {
303 xattr = rb_entry(p, struct ceph_inode_xattr, node);
304 memcpy(dest, xattr->name, xattr->name_len);
305 dest[xattr->name_len] = '\0';
306
307 dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
308 xattr->name_len, ci->i_xattrs.names_size);
309
310 dest += xattr->name_len + 1;
311 p = rb_next(p);
312 }
313
314 return dest;
315}
316
317void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
318{
319 struct rb_node *p, *tmp;
320 struct ceph_inode_xattr *xattr = NULL;
321
322 p = rb_first(&ci->i_xattrs.index);
323
324 dout("__ceph_destroy_xattrs p=%p\n", p);
325
326 while (p) {
327 xattr = rb_entry(p, struct ceph_inode_xattr, node);
328 tmp = p;
329 p = rb_next(tmp);
330 dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p,
331 xattr->name_len, xattr->name);
332 rb_erase(tmp, &ci->i_xattrs.index);
333
334 __free_xattr(xattr);
335 }
336
337 ci->i_xattrs.names_size = 0;
338 ci->i_xattrs.vals_size = 0;
339 ci->i_xattrs.index_version = 0;
340 ci->i_xattrs.count = 0;
341 ci->i_xattrs.index = RB_ROOT;
342}
343
344static int __build_xattrs(struct inode *inode)
345{
346 u32 namelen;
347 u32 numattr = 0;
348 void *p, *end;
349 u32 len;
350 const char *name, *val;
351 struct ceph_inode_info *ci = ceph_inode(inode);
352 int xattr_version;
353 struct ceph_inode_xattr **xattrs = NULL;
354 int err = 0;
355 int i;
356
357 dout("__build_xattrs() len=%d\n",
358 ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
359
360 if (ci->i_xattrs.index_version >= ci->i_xattrs.version)
361 return 0; /* already built */
362
363 __ceph_destroy_xattrs(ci);
364
365start:
366 /* update the internal xattr rb tree */
367 if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) {
368 p = ci->i_xattrs.blob->vec.iov_base;
369 end = p + ci->i_xattrs.blob->vec.iov_len;
370 ceph_decode_32_safe(&p, end, numattr, bad);
371 xattr_version = ci->i_xattrs.version;
372 spin_unlock(&inode->i_lock);
373
374 xattrs = kcalloc(numattr, sizeof(struct ceph_inode_xattr *),
375 GFP_NOFS);
376 err = -ENOMEM;
377 if (!xattrs)
378 goto bad_lock;
379 memset(xattrs, 0, numattr*sizeof(struct ceph_inode_xattr *));
380 for (i = 0; i < numattr; i++) {
381 xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr),
382 GFP_NOFS);
383 if (!xattrs[i])
384 goto bad_lock;
385 }
386
387 spin_lock(&inode->i_lock);
388 if (ci->i_xattrs.version != xattr_version) {
389 /* lost a race, retry */
390 for (i = 0; i < numattr; i++)
391 kfree(xattrs[i]);
392 kfree(xattrs);
393 goto start;
394 }
395 err = -EIO;
396 while (numattr--) {
397 ceph_decode_32_safe(&p, end, len, bad);
398 namelen = len;
399 name = p;
400 p += len;
401 ceph_decode_32_safe(&p, end, len, bad);
402 val = p;
403 p += len;
404
405 err = __set_xattr(ci, name, namelen, val, len,
406 0, 0, 0, &xattrs[numattr]);
407
408 if (err < 0)
409 goto bad;
410 }
411 kfree(xattrs);
412 }
413 ci->i_xattrs.index_version = ci->i_xattrs.version;
414 ci->i_xattrs.dirty = false;
415
416 return err;
417bad_lock:
418 spin_lock(&inode->i_lock);
419bad:
420 if (xattrs) {
421 for (i = 0; i < numattr; i++)
422 kfree(xattrs[i]);
423 kfree(xattrs);
424 }
425 ci->i_xattrs.names_size = 0;
426 return err;
427}
428
429static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
430 int val_size)
431{
432 /*
433 * 4 bytes for the count, plus a 4-byte length for each xattr name
434 * and a 4-byte length for each value
435 */
436 int size = 4 + ci->i_xattrs.count*(4 + 4) +
437 ci->i_xattrs.names_size +
438 ci->i_xattrs.vals_size;
439 dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n",
440 ci->i_xattrs.count, ci->i_xattrs.names_size,
441 ci->i_xattrs.vals_size);
442
443 if (name_size)
444 size += 4 + 4 + name_size + val_size;
445
446 return size;
447}
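
Working the formula through for a concrete inode makes the layout clear: with two xattrs already present (names of 6 and 8 bytes, values of 3 and 5 bytes), the base blob is 4 + 2*(4+4) + 14 + 8 = 42 bytes; a pending 9-byte name with a 4-byte value adds 4 + 4 + 9 + 4 = 21, for 63 in total. A trivial check of the same arithmetic, with demo_* names standing in:

#include <stdio.h>

/*
 * mirror of __get_required_blob_size(): a 4-byte count, then a 4-byte
 * length before every name and before every value
 */
static int demo_blob_size(int count, int names_size, int vals_size,
                          int new_name, int new_val)
{
        int size = 4 + count * (4 + 4) + names_size + vals_size;

        if (new_name)
                size += 4 + 4 + new_name + new_val;
        return size;
}

int main(void)
{
        printf("%d\n", demo_blob_size(2, 6 + 8, 3 + 5, 9, 4)); /* 63 */
        return 0;
}
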
448
449/*
450 * If there are dirty xattrs, reencode xattrs into the prealloc_blob
451 * and swap into place.
452 */
453void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
454{
455 struct rb_node *p;
456 struct ceph_inode_xattr *xattr = NULL;
457 void *dest;
458
459 dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
460 if (ci->i_xattrs.dirty) {
461 int need = __get_required_blob_size(ci, 0, 0);
462
463 BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len);
464
465 p = rb_first(&ci->i_xattrs.index);
466 dest = ci->i_xattrs.prealloc_blob->vec.iov_base;
467
468 ceph_encode_32(&dest, ci->i_xattrs.count);
469 while (p) {
470 xattr = rb_entry(p, struct ceph_inode_xattr, node);
471
472 ceph_encode_32(&dest, xattr->name_len);
473 memcpy(dest, xattr->name, xattr->name_len);
474 dest += xattr->name_len;
475 ceph_encode_32(&dest, xattr->val_len);
476 memcpy(dest, xattr->val, xattr->val_len);
477 dest += xattr->val_len;
478
479 p = rb_next(p);
480 }
481
482 /* adjust buffer len; it may be larger than we need */
483 ci->i_xattrs.prealloc_blob->vec.iov_len =
484 dest - ci->i_xattrs.prealloc_blob->vec.iov_base;
485
486 if (ci->i_xattrs.blob)
487 ceph_buffer_put(ci->i_xattrs.blob);
488 ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
489 ci->i_xattrs.prealloc_blob = NULL;
490 ci->i_xattrs.dirty = false;
491 }
492}
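
The resulting blob is a flat little-endian encoding: a u32 count followed by (u32 length, bytes) pairs for each name and each value. A hedged userspace sketch of the same wire format, where demo_encode_32 only approximates ceph_encode_32():

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* write a u32 little-endian and advance the cursor, like ceph_encode_32() */
static void demo_encode_32(unsigned char **p, uint32_t v)
{
        (*p)[0] = v & 0xff;
        (*p)[1] = (v >> 8) & 0xff;
        (*p)[2] = (v >> 16) & 0xff;
        (*p)[3] = (v >> 24) & 0xff;
        *p += 4;
}

static void demo_encode_pair(unsigned char **p, const char *s)
{
        uint32_t len = (uint32_t)strlen(s);

        demo_encode_32(p, len);
        memcpy(*p, s, len);
        *p += len;
}

int main(void)
{
        unsigned char blob[64], *p = blob;

        demo_encode_32(&p, 1);              /* one xattr in the blob */
        demo_encode_pair(&p, "user.demo");  /* name */
        demo_encode_pair(&p, "value");      /* value */
        printf("blob is %ld bytes\n", (long)(p - blob)); /* 4+4+9+4+5 = 26 */
        return 0;
}
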
493
494ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
495 size_t size)
496{
497 struct inode *inode = dentry->d_inode;
498 struct ceph_inode_info *ci = ceph_inode(inode);
499 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
500 int err;
501 struct ceph_inode_xattr *xattr;
502 struct ceph_vxattr_cb *vxattr = NULL;
503
504 if (!ceph_is_valid_xattr(name))
505 return -ENODATA;
506
507 /* let's see if a virtual xattr was requested */
508 if (vxattrs)
509 vxattr = ceph_match_vxattr(vxattrs, name);
510
511 spin_lock(&inode->i_lock);
512 dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
513 ci->i_xattrs.version, ci->i_xattrs.index_version);
514
515 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
516 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
517 goto get_xattr;
518 } else {
519 spin_unlock(&inode->i_lock);
520 /* get xattrs from mds (if we don't already have them) */
521 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
522 if (err)
523 return err;
524 }
525
526 spin_lock(&inode->i_lock);
527
528 if (vxattr && vxattr->readonly) {
529 err = vxattr->getxattr_cb(ci, value, size);
530 goto out;
531 }
532
533 err = __build_xattrs(inode);
534 if (err < 0)
535 goto out;
536
537get_xattr:
538 err = -ENODATA; /* == ENOATTR */
539 xattr = __get_xattr(ci, name);
540 if (!xattr) {
541 if (vxattr)
542 err = vxattr->getxattr_cb(ci, value, size);
543 goto out;
544 }
545
546 err = -ERANGE;
547 if (size && size < xattr->val_len)
548 goto out;
549
550 err = xattr->val_len;
551 if (size == 0)
552 goto out;
553
554 memcpy(value, xattr->val, xattr->val_len);
555
556out:
557 spin_unlock(&inode->i_lock);
558 return err;
559}
560
561ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
562{
563 struct inode *inode = dentry->d_inode;
564 struct ceph_inode_info *ci = ceph_inode(inode);
565 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
566 u32 vir_namelen = 0;
567 u32 namelen;
568 int err;
569 u32 len;
570 int i;
571
572 spin_lock(&inode->i_lock);
573 dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
574 ci->i_xattrs.version, ci->i_xattrs.index_version);
575
576 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
577 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
578 goto list_xattr;
579 } else {
580 spin_unlock(&inode->i_lock);
581 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
582 if (err)
583 return err;
584 }
585
586 spin_lock(&inode->i_lock);
587
588 err = __build_xattrs(inode);
589 if (err < 0)
590 goto out;
591
592list_xattr:
593 vir_namelen = 0;
594 /* include virtual dir xattrs */
595 if (vxattrs)
596 for (i = 0; vxattrs[i].name; i++)
597 vir_namelen += strlen(vxattrs[i].name) + 1;
598 /* plus 1 byte per regular xattr name for its null terminator */
599 namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
600 err = -ERANGE;
601 if (size && namelen > size)
602 goto out;
603
604 err = namelen;
605 if (size == 0)
606 goto out;
607
608 names = __copy_xattr_names(ci, names);
609
610 /* virtual xattr names, too */
611 if (vxattrs)
612 for (i = 0; vxattrs[i].name; i++) {
613 len = sprintf(names, "%s", vxattrs[i].name);
614 names += len + 1;
615 }
616
617out:
618 spin_unlock(&inode->i_lock);
619 return err;
620}
621
622static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
623 const char *value, size_t size, int flags)
624{
625 struct ceph_client *client = ceph_client(dentry->d_sb);
626 struct inode *inode = dentry->d_inode;
627 struct ceph_inode_info *ci = ceph_inode(inode);
628 struct inode *parent_inode = dentry->d_parent->d_inode;
629 struct ceph_mds_request *req;
630 struct ceph_mds_client *mdsc = &client->mdsc;
631 int err;
632 int i, nr_pages;
633 struct page **pages = NULL;
634 void *kaddr;
635
636 /* copy value into some pages */
637 nr_pages = calc_pages_for(0, size);
638 if (nr_pages) {
639 pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS);
640 if (!pages)
641 return -ENOMEM;
642 err = -ENOMEM;
643 for (i = 0; i < nr_pages; i++) {
644 pages[i] = alloc_page(GFP_NOFS);
645 if (!pages[i]) {
646 nr_pages = i;
647 goto out;
648 }
649 kaddr = kmap(pages[i]);
650 memcpy(kaddr, value + i*PAGE_CACHE_SIZE,
651 min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE));
652 }
653 }
654
655 dout("setxattr value=%.*s\n", (int)size, value);
656
657 /* do request */
658 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
659 USE_AUTH_MDS);
660 if (IS_ERR(req)) {
661 err = PTR_ERR(req);
662 goto out;
663 }
664 req->r_inode = igrab(inode);
665 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
666 req->r_num_caps = 1;
667 req->r_args.setxattr.flags = cpu_to_le32(flags);
668 req->r_path2 = kstrdup(name, GFP_NOFS);
669
670 req->r_pages = pages;
671 req->r_num_pages = nr_pages;
672 req->r_data_len = size;
673
674 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
675 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
676 ceph_mdsc_put_request(req);
677 dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
678
679out:
680 if (pages) {
681 for (i = 0; i < nr_pages; i++)
682 __free_page(pages[i]);
683 kfree(pages);
684 }
685 return err;
686}
687
688int ceph_setxattr(struct dentry *dentry, const char *name,
689 const void *value, size_t size, int flags)
690{
691 struct inode *inode = dentry->d_inode;
692 struct ceph_inode_info *ci = ceph_inode(inode);
693 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
694 int err;
695 int name_len = strlen(name);
696 int val_len = size;
697 char *newname = NULL;
698 char *newval = NULL;
699 struct ceph_inode_xattr *xattr = NULL;
700 int issued;
701 int required_blob_size;
702
703 if (ceph_snap(inode) != CEPH_NOSNAP)
704 return -EROFS;
705
706 if (!ceph_is_valid_xattr(name))
707 return -EOPNOTSUPP;
708
709 if (vxattrs) {
710 struct ceph_vxattr_cb *vxattr =
711 ceph_match_vxattr(vxattrs, name);
712 if (vxattr && vxattr->readonly)
713 return -EOPNOTSUPP;
714 }
715
716 /* preallocate memory for xattr name, value, index node */
717 err = -ENOMEM;
718 newname = kmalloc(name_len + 1, GFP_NOFS);
719 if (!newname)
720 goto out;
721 memcpy(newname, name, name_len + 1);
722
723 if (val_len) {
724 newval = kmalloc(val_len + 1, GFP_NOFS);
725 if (!newval)
726 goto out;
727 memcpy(newval, value, val_len);
728 newval[val_len] = '\0';
729 }
730
731 xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);
732 if (!xattr)
733 goto out;
734
735 spin_lock(&inode->i_lock);
736retry:
737 issued = __ceph_caps_issued(ci, NULL);
738 if (!(issued & CEPH_CAP_XATTR_EXCL))
739 goto do_sync;
740 __build_xattrs(inode);
741
742 required_blob_size = __get_required_blob_size(ci, name_len, val_len);
743
744 if (!ci->i_xattrs.prealloc_blob ||
745 required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
746 struct ceph_buffer *blob = NULL;
747
748 spin_unlock(&inode->i_lock);
749 dout(" preallocating new blob size=%d\n", required_blob_size);
750 blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
751 if (!blob)
752 goto out;
753 spin_lock(&inode->i_lock);
754 if (ci->i_xattrs.prealloc_blob)
755 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
756 ci->i_xattrs.prealloc_blob = blob;
757 goto retry;
758 }
759
760 dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
761 err = __set_xattr(ci, newname, name_len, newval,
762 val_len, 1, 1, 1, &xattr);
763 __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
764 ci->i_xattrs.dirty = true;
765 inode->i_ctime = CURRENT_TIME;
766 spin_unlock(&inode->i_lock);
767
768 return err;
769
770do_sync:
771 spin_unlock(&inode->i_lock);
772 err = ceph_sync_setxattr(dentry, name, value, size, flags);
773out:
774 kfree(newname);
775 kfree(newval);
776 kfree(xattr);
777 return err;
778}
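
The retry loop above is a common kernel idiom: memory cannot be allocated while a spinlock is held, so the code drops the lock, allocates, retakes the lock, and re-checks whether the world changed in between. A compressed sketch of the pattern, using a pthread mutex purely for illustration (demo_reserve and the globals are hypothetical):

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static char *prealloc;
static size_t prealloc_len;

/* ensure at least `need` bytes are preallocated before the critical work */
static int demo_reserve(size_t need)
{
        pthread_mutex_lock(&lock);
retry:
        if (prealloc_len < need) {
                char *blob;

                pthread_mutex_unlock(&lock);  /* cannot allocate under the lock */
                blob = malloc(need);
                if (!blob)
                        return -1;
                pthread_mutex_lock(&lock);
                free(prealloc);               /* another thread may have raced us */
                prealloc = blob;
                prealloc_len = need;
                goto retry;                   /* re-check under the lock */
        }
        /* ... critical-section work using prealloc ... */
        pthread_mutex_unlock(&lock);
        return 0;
}

int main(void)
{
        return demo_reserve(128);
}
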
779
780static int ceph_send_removexattr(struct dentry *dentry, const char *name)
781{
782 struct ceph_client *client = ceph_client(dentry->d_sb);
783 struct ceph_mds_client *mdsc = &client->mdsc;
784 struct inode *inode = dentry->d_inode;
785 struct inode *parent_inode = dentry->d_parent->d_inode;
786 struct ceph_mds_request *req;
787 int err;
788
789 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR,
790 USE_AUTH_MDS);
791 if (IS_ERR(req))
792 return PTR_ERR(req);
793 req->r_inode = igrab(inode);
794 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
795 req->r_num_caps = 1;
796 req->r_path2 = kstrdup(name, GFP_NOFS);
797
798 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
799 ceph_mdsc_put_request(req);
800 return err;
801}
802
803int ceph_removexattr(struct dentry *dentry, const char *name)
804{
805 struct inode *inode = dentry->d_inode;
806 struct ceph_inode_info *ci = ceph_inode(inode);
807 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
808 int issued;
809 int err;
810
811 if (ceph_snap(inode) != CEPH_NOSNAP)
812 return -EROFS;
813
814 if (!ceph_is_valid_xattr(name))
815 return -EOPNOTSUPP;
816
817 if (vxattrs) {
818 struct ceph_vxattr_cb *vxattr =
819 ceph_match_vxattr(vxattrs, name);
820 if (vxattr && vxattr->readonly)
821 return -EOPNOTSUPP;
822 }
823
824 spin_lock(&inode->i_lock);
825 __build_xattrs(inode);
826 issued = __ceph_caps_issued(ci, NULL);
827 dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
828
829 if (!(issued & CEPH_CAP_XATTR_EXCL))
830 goto do_sync;
831
832 err = __remove_xattr_by_name(ceph_inode(inode), name);
833 __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
834 ci->i_xattrs.dirty = true;
835 inode->i_ctime = CURRENT_TIME;
836
837 spin_unlock(&inode->i_lock);
838
839 return err;
840do_sync:
841 spin_unlock(&inode->i_lock);
842 err = ceph_send_removexattr(dentry, name);
843 return err;
844}
845
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index 20692fbfdb24..cfd1ce34e0bc 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -136,7 +136,7 @@ asn1_enum_decode(struct asn1_ctx *ctx, __le32 *val)
136 return 0; 136 return 0;
137 } 137 }
138 138
139 ch = *(ctx->pointer)++; /* ch has 0xa, ptr points to lenght octet */ 139 ch = *(ctx->pointer)++; /* ch has 0xa, ptr points to length octet */
140 if ((ch) == ASN1_ENUM) /* if ch value is ENUM, 0xa */ 140 if ((ch) == ASN1_ENUM) /* if ch value is ENUM, 0xa */
141 *val = *(++(ctx->pointer)); /* value has enum value */ 141 *val = *(++(ctx->pointer)); /* value has enum value */
142 else 142 else
@@ -492,17 +492,13 @@ compare_oid(unsigned long *oid1, unsigned int oid1len,
492 492
493int 493int
494decode_negTokenInit(unsigned char *security_blob, int length, 494decode_negTokenInit(unsigned char *security_blob, int length,
495 enum securityEnum *secType) 495 struct TCP_Server_Info *server)
496{ 496{
497 struct asn1_ctx ctx; 497 struct asn1_ctx ctx;
498 unsigned char *end; 498 unsigned char *end;
499 unsigned char *sequence_end; 499 unsigned char *sequence_end;
500 unsigned long *oid = NULL; 500 unsigned long *oid = NULL;
501 unsigned int cls, con, tag, oidlen, rc; 501 unsigned int cls, con, tag, oidlen, rc;
502 bool use_ntlmssp = false;
503 bool use_kerberos = false;
504 bool use_kerberosu2u = false;
505 bool use_mskerberos = false;
506 502
507 /* cifs_dump_mem(" Received SecBlob ", security_blob, length); */ 503 /* cifs_dump_mem(" Received SecBlob ", security_blob, length); */
508 504
@@ -510,11 +506,11 @@ decode_negTokenInit(unsigned char *security_blob, int length,
510 506
511 /* GSSAPI header */ 507 /* GSSAPI header */
512 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 508 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
513 cFYI(1, ("Error decoding negTokenInit header")); 509 cFYI(1, "Error decoding negTokenInit header");
514 return 0; 510 return 0;
515 } else if ((cls != ASN1_APL) || (con != ASN1_CON) 511 } else if ((cls != ASN1_APL) || (con != ASN1_CON)
516 || (tag != ASN1_EOC)) { 512 || (tag != ASN1_EOC)) {
517 cFYI(1, ("cls = %d con = %d tag = %d", cls, con, tag)); 513 cFYI(1, "cls = %d con = %d tag = %d", cls, con, tag);
518 return 0; 514 return 0;
519 } 515 }
520 516
@@ -535,56 +531,52 @@ decode_negTokenInit(unsigned char *security_blob, int length,
535 531
536 /* SPNEGO OID not present or garbled -- bail out */ 532 /* SPNEGO OID not present or garbled -- bail out */
537 if (!rc) { 533 if (!rc) {
538 cFYI(1, ("Error decoding negTokenInit header")); 534 cFYI(1, "Error decoding negTokenInit header");
539 return 0; 535 return 0;
540 } 536 }
541 537
542 /* SPNEGO */ 538 /* SPNEGO */
543 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 539 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
544 cFYI(1, ("Error decoding negTokenInit")); 540 cFYI(1, "Error decoding negTokenInit");
545 return 0; 541 return 0;
546 } else if ((cls != ASN1_CTX) || (con != ASN1_CON) 542 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)
547 || (tag != ASN1_EOC)) { 543 || (tag != ASN1_EOC)) {
548 cFYI(1, 544 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0",
549 ("cls = %d con = %d tag = %d end = %p (%d) exit 0", 545 cls, con, tag, end, *end);
550 cls, con, tag, end, *end));
551 return 0; 546 return 0;
552 } 547 }
553 548
554 /* negTokenInit */ 549 /* negTokenInit */
555 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 550 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
556 cFYI(1, ("Error decoding negTokenInit")); 551 cFYI(1, "Error decoding negTokenInit");
557 return 0; 552 return 0;
558 } else if ((cls != ASN1_UNI) || (con != ASN1_CON) 553 } else if ((cls != ASN1_UNI) || (con != ASN1_CON)
559 || (tag != ASN1_SEQ)) { 554 || (tag != ASN1_SEQ)) {
560 cFYI(1, 555 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1",
561 ("cls = %d con = %d tag = %d end = %p (%d) exit 1", 556 cls, con, tag, end, *end);
562 cls, con, tag, end, *end));
563 return 0; 557 return 0;
564 } 558 }
565 559
566 /* sequence */ 560 /* sequence */
567 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 561 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
568 cFYI(1, ("Error decoding 2nd part of negTokenInit")); 562 cFYI(1, "Error decoding 2nd part of negTokenInit");
569 return 0; 563 return 0;
570 } else if ((cls != ASN1_CTX) || (con != ASN1_CON) 564 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)
571 || (tag != ASN1_EOC)) { 565 || (tag != ASN1_EOC)) {
572 cFYI(1, 566 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0",
573 ("cls = %d con = %d tag = %d end = %p (%d) exit 0", 567 cls, con, tag, end, *end);
574 cls, con, tag, end, *end));
575 return 0; 568 return 0;
576 } 569 }
577 570
578 /* sequence of */ 571 /* sequence of */
579 if (asn1_header_decode 572 if (asn1_header_decode
580 (&ctx, &sequence_end, &cls, &con, &tag) == 0) { 573 (&ctx, &sequence_end, &cls, &con, &tag) == 0) {
581 cFYI(1, ("Error decoding 2nd part of negTokenInit")); 574 cFYI(1, "Error decoding 2nd part of negTokenInit");
582 return 0; 575 return 0;
583 } else if ((cls != ASN1_UNI) || (con != ASN1_CON) 576 } else if ((cls != ASN1_UNI) || (con != ASN1_CON)
584 || (tag != ASN1_SEQ)) { 577 || (tag != ASN1_SEQ)) {
585 cFYI(1, 578 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1",
586 ("cls = %d con = %d tag = %d end = %p (%d) exit 1", 579 cls, con, tag, end, *end);
587 cls, con, tag, end, *end));
588 return 0; 580 return 0;
589 } 581 }
590 582
@@ -592,37 +584,33 @@ decode_negTokenInit(unsigned char *security_blob, int length,
592 while (!asn1_eoc_decode(&ctx, sequence_end)) { 584 while (!asn1_eoc_decode(&ctx, sequence_end)) {
593 rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag); 585 rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag);
594 if (!rc) { 586 if (!rc) {
595 cFYI(1, 587 cFYI(1, "Error decoding negTokenInit hdr exit2");
596 ("Error decoding negTokenInit hdr exit2"));
597 return 0; 588 return 0;
598 } 589 }
599 if ((tag == ASN1_OJI) && (con == ASN1_PRI)) { 590 if ((tag == ASN1_OJI) && (con == ASN1_PRI)) {
600 if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) { 591 if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) {
601 592
602 cFYI(1, ("OID len = %d oid = 0x%lx 0x%lx " 593 cFYI(1, "OID len = %d oid = 0x%lx 0x%lx "
603 "0x%lx 0x%lx", oidlen, *oid, 594 "0x%lx 0x%lx", oidlen, *oid,
604 *(oid + 1), *(oid + 2), *(oid + 3))); 595 *(oid + 1), *(oid + 2), *(oid + 3));
605 596
606 if (compare_oid(oid, oidlen, MSKRB5_OID, 597 if (compare_oid(oid, oidlen, MSKRB5_OID,
607 MSKRB5_OID_LEN) && 598 MSKRB5_OID_LEN))
608 !use_mskerberos) 599 server->sec_mskerberos = true;
609 use_mskerberos = true;
610 else if (compare_oid(oid, oidlen, KRB5U2U_OID, 600 else if (compare_oid(oid, oidlen, KRB5U2U_OID,
611 KRB5U2U_OID_LEN) && 601 KRB5U2U_OID_LEN))
612 !use_kerberosu2u) 602 server->sec_kerberosu2u = true;
613 use_kerberosu2u = true;
614 else if (compare_oid(oid, oidlen, KRB5_OID, 603 else if (compare_oid(oid, oidlen, KRB5_OID,
615 KRB5_OID_LEN) && 604 KRB5_OID_LEN))
616 !use_kerberos) 605 server->sec_kerberos = true;
617 use_kerberos = true;
618 else if (compare_oid(oid, oidlen, NTLMSSP_OID, 606 else if (compare_oid(oid, oidlen, NTLMSSP_OID,
619 NTLMSSP_OID_LEN)) 607 NTLMSSP_OID_LEN))
620 use_ntlmssp = true; 608 server->sec_ntlmssp = true;
621 609
622 kfree(oid); 610 kfree(oid);
623 } 611 }
624 } else { 612 } else {
625 cFYI(1, ("Should be an oid what is going on?")); 613 cFYI(1, "Should be an oid what is going on?");
626 } 614 }
627 } 615 }
628 616
@@ -632,54 +620,47 @@ decode_negTokenInit(unsigned char *security_blob, int length,
632 no mechListMic (e.g. NTLMSSP instead of KRB5) */ 620 no mechListMic (e.g. NTLMSSP instead of KRB5) */
633 if (ctx.error == ASN1_ERR_DEC_EMPTY) 621 if (ctx.error == ASN1_ERR_DEC_EMPTY)
634 goto decode_negtoken_exit; 622 goto decode_negtoken_exit;
635 cFYI(1, ("Error decoding last part negTokenInit exit3")); 623 cFYI(1, "Error decoding last part negTokenInit exit3");
636 return 0; 624 return 0;
637 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { 625 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
638 /* tag = 3 indicating mechListMIC */ 626 /* tag = 3 indicating mechListMIC */
639 cFYI(1, ("Exit 4 cls = %d con = %d tag = %d end = %p (%d)", 627 cFYI(1, "Exit 4 cls = %d con = %d tag = %d end = %p (%d)",
640 cls, con, tag, end, *end)); 628 cls, con, tag, end, *end);
641 return 0; 629 return 0;
642 } 630 }
643 631
644 /* sequence */ 632 /* sequence */
645 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 633 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
646 cFYI(1, ("Error decoding last part negTokenInit exit5")); 634 cFYI(1, "Error decoding last part negTokenInit exit5");
647 return 0; 635 return 0;
648 } else if ((cls != ASN1_UNI) || (con != ASN1_CON) 636 } else if ((cls != ASN1_UNI) || (con != ASN1_CON)
649 || (tag != ASN1_SEQ)) { 637 || (tag != ASN1_SEQ)) {
650 cFYI(1, ("cls = %d con = %d tag = %d end = %p (%d)", 638 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d)",
651 cls, con, tag, end, *end)); 639 cls, con, tag, end, *end);
652 } 640 }
653 641
654 /* sequence of */ 642 /* sequence of */
655 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 643 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
656 cFYI(1, ("Error decoding last part negTokenInit exit 7")); 644 cFYI(1, "Error decoding last part negTokenInit exit 7");
657 return 0; 645 return 0;
658 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { 646 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
659 cFYI(1, ("Exit 8 cls = %d con = %d tag = %d end = %p (%d)", 647 cFYI(1, "Exit 8 cls = %d con = %d tag = %d end = %p (%d)",
660 cls, con, tag, end, *end)); 648 cls, con, tag, end, *end);
661 return 0; 649 return 0;
662 } 650 }
663 651
664 /* general string */ 652 /* general string */
665 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 653 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
666 cFYI(1, ("Error decoding last part negTokenInit exit9")); 654 cFYI(1, "Error decoding last part negTokenInit exit9");
667 return 0; 655 return 0;
668 } else if ((cls != ASN1_UNI) || (con != ASN1_PRI) 656 } else if ((cls != ASN1_UNI) || (con != ASN1_PRI)
669 || (tag != ASN1_GENSTR)) { 657 || (tag != ASN1_GENSTR)) {
670 cFYI(1, ("Exit10 cls = %d con = %d tag = %d end = %p (%d)", 658 cFYI(1, "Exit10 cls = %d con = %d tag = %d end = %p (%d)",
671 cls, con, tag, end, *end)); 659 cls, con, tag, end, *end);
672 return 0; 660 return 0;
673 } 661 }
674 cFYI(1, ("Need to call asn1_octets_decode() function for %s", 662 cFYI(1, "Need to call asn1_octets_decode() function for %s",
675 ctx.pointer)); /* is this UTF-8 or ASCII? */ 663 ctx.pointer); /* is this UTF-8 or ASCII? */
676decode_negtoken_exit: 664decode_negtoken_exit:
677 if (use_kerberos)
678 *secType = Kerberos;
679 else if (use_mskerberos)
680 *secType = MSKerberos;
681 else if (use_ntlmssp)
682 *secType = RawNTLMSSP;
683
684 return 1; 665 return 1;
685} 666}
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 42cec2a7c0cf..4fce6e61b34e 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -60,10 +60,10 @@ cifs_dump_mem(char *label, void *data, int length)
60#ifdef CONFIG_CIFS_DEBUG2 60#ifdef CONFIG_CIFS_DEBUG2
61void cifs_dump_detail(struct smb_hdr *smb) 61void cifs_dump_detail(struct smb_hdr *smb)
62{ 62{
63 cERROR(1, ("Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d", 63 cERROR(1, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d",
64 smb->Command, smb->Status.CifsError, 64 smb->Command, smb->Status.CifsError,
65 smb->Flags, smb->Flags2, smb->Mid, smb->Pid)); 65 smb->Flags, smb->Flags2, smb->Mid, smb->Pid);
66 cERROR(1, ("smb buf %p len %d", smb, smbCalcSize_LE(smb))); 66 cERROR(1, "smb buf %p len %d", smb, smbCalcSize_LE(smb));
67} 67}
68 68
69 69
@@ -75,25 +75,25 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
75 if (server == NULL) 75 if (server == NULL)
76 return; 76 return;
77 77
78 cERROR(1, ("Dump pending requests:")); 78 cERROR(1, "Dump pending requests:");
79 spin_lock(&GlobalMid_Lock); 79 spin_lock(&GlobalMid_Lock);
80 list_for_each(tmp, &server->pending_mid_q) { 80 list_for_each(tmp, &server->pending_mid_q) {
81 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 81 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
82 cERROR(1, ("State: %d Cmd: %d Pid: %d Tsk: %p Mid %d", 82 cERROR(1, "State: %d Cmd: %d Pid: %d Tsk: %p Mid %d",
83 mid_entry->midState, 83 mid_entry->midState,
84 (int)mid_entry->command, 84 (int)mid_entry->command,
85 mid_entry->pid, 85 mid_entry->pid,
86 mid_entry->tsk, 86 mid_entry->tsk,
87 mid_entry->mid)); 87 mid_entry->mid);
88#ifdef CONFIG_CIFS_STATS2 88#ifdef CONFIG_CIFS_STATS2
89 cERROR(1, ("IsLarge: %d buf: %p time rcv: %ld now: %ld", 89 cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld",
90 mid_entry->largeBuf, 90 mid_entry->largeBuf,
91 mid_entry->resp_buf, 91 mid_entry->resp_buf,
92 mid_entry->when_received, 92 mid_entry->when_received,
93 jiffies)); 93 jiffies);
94#endif /* STATS2 */ 94#endif /* STATS2 */
95 cERROR(1, ("IsMult: %d IsEnd: %d", mid_entry->multiRsp, 95 cERROR(1, "IsMult: %d IsEnd: %d", mid_entry->multiRsp,
96 mid_entry->multiEnd)); 96 mid_entry->multiEnd);
97 if (mid_entry->resp_buf) { 97 if (mid_entry->resp_buf) {
98 cifs_dump_detail(mid_entry->resp_buf); 98 cifs_dump_detail(mid_entry->resp_buf);
99 cifs_dump_mem("existing buf: ", 99 cifs_dump_mem("existing buf: ",
@@ -716,7 +716,7 @@ static const struct file_operations cifs_multiuser_mount_proc_fops = {
716 716
717static int cifs_security_flags_proc_show(struct seq_file *m, void *v) 717static int cifs_security_flags_proc_show(struct seq_file *m, void *v)
718{ 718{
719 seq_printf(m, "0x%x\n", extended_security); 719 seq_printf(m, "0x%x\n", global_secflags);
720 return 0; 720 return 0;
721} 721}
722 722
@@ -744,13 +744,13 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
744 /* single char or single char followed by null */ 744 /* single char or single char followed by null */
745 c = flags_string[0]; 745 c = flags_string[0];
746 if (c == '0' || c == 'n' || c == 'N') { 746 if (c == '0' || c == 'n' || c == 'N') {
747 extended_security = CIFSSEC_DEF; /* default */ 747 global_secflags = CIFSSEC_DEF; /* default */
748 return count; 748 return count;
749 } else if (c == '1' || c == 'y' || c == 'Y') { 749 } else if (c == '1' || c == 'y' || c == 'Y') {
750 extended_security = CIFSSEC_MAX; 750 global_secflags = CIFSSEC_MAX;
751 return count; 751 return count;
752 } else if (!isdigit(c)) { 752 } else if (!isdigit(c)) {
753 cERROR(1, ("invalid flag %c", c)); 753 cERROR(1, "invalid flag %c", c);
754 return -EINVAL; 754 return -EINVAL;
755 } 755 }
756 } 756 }
@@ -758,26 +758,26 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
758 758
759 flags = simple_strtoul(flags_string, NULL, 0); 759 flags = simple_strtoul(flags_string, NULL, 0);
760 760
761 cFYI(1, ("sec flags 0x%x", flags)); 761 cFYI(1, "sec flags 0x%x", flags);
762 762
763 if (flags <= 0) { 763 if (flags <= 0) {
764 cERROR(1, ("invalid security flags %s", flags_string)); 764 cERROR(1, "invalid security flags %s", flags_string);
765 return -EINVAL; 765 return -EINVAL;
766 } 766 }
767 767
768 if (flags & ~CIFSSEC_MASK) { 768 if (flags & ~CIFSSEC_MASK) {
769 cERROR(1, ("attempt to set unsupported security flags 0x%x", 769 cERROR(1, "attempt to set unsupported security flags 0x%x",
770 flags & ~CIFSSEC_MASK)); 770 flags & ~CIFSSEC_MASK);
771 return -EINVAL; 771 return -EINVAL;
772 } 772 }
773 /* flags look ok - update the global security flags for cifs module */ 773 /* flags look ok - update the global security flags for cifs module */
774 extended_security = flags; 774 global_secflags = flags;
775 if (extended_security & CIFSSEC_MUST_SIGN) { 775 if (global_secflags & CIFSSEC_MUST_SIGN) {
776 /* requiring signing implies signing is allowed */ 776 /* requiring signing implies signing is allowed */
777 extended_security |= CIFSSEC_MAY_SIGN; 777 global_secflags |= CIFSSEC_MAY_SIGN;
778 cFYI(1, ("packet signing now required")); 778 cFYI(1, "packet signing now required");
779 } else if ((extended_security & CIFSSEC_MAY_SIGN) == 0) { 779 } else if ((global_secflags & CIFSSEC_MAY_SIGN) == 0) {
780 cFYI(1, ("packet signing disabled")); 780 cFYI(1, "packet signing disabled");
781 } 781 }
782 /* BB should we turn on MAY flags for other MUST options? */ 782 /* BB should we turn on MAY flags for other MUST options? */
783 return count; 783 return count;
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index 5eb3b83bbfa7..aa316891ac0c 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -43,34 +43,54 @@ void dump_smb(struct smb_hdr *, int);
43 */ 43 */
44#ifdef CIFS_DEBUG 44#ifdef CIFS_DEBUG
45 45
46
47/* information message: e.g., configuration, major event */ 46/* information message: e.g., configuration, major event */
48extern int cifsFYI; 47extern int cifsFYI;
49#define cifsfyi(format,arg...) if (cifsFYI & CIFS_INFO) printk(KERN_DEBUG " " __FILE__ ": " format "\n" "" , ## arg) 48#define cifsfyi(fmt, arg...) \
49do { \
50 if (cifsFYI & CIFS_INFO) \
51 printk(KERN_DEBUG "%s: " fmt "\n", __FILE__, ##arg); \
52} while (0)
50 53
51#define cFYI(button,prspec) if (button) cifsfyi prspec 54#define cFYI(set, fmt, arg...) \
55do { \
56 if (set) \
57 cifsfyi(fmt, ##arg); \
58} while (0)
52 59
53#define cifswarn(format, arg...) printk(KERN_WARNING ": " format "\n" , ## arg) 60#define cifswarn(fmt, arg...) \
61 printk(KERN_WARNING fmt "\n", ##arg)
54 62
55/* debug event message: */ 63/* debug event message: */
56extern int cifsERROR; 64extern int cifsERROR;
57 65
58#define cEVENT(format,arg...) if (cifsERROR) printk(KERN_EVENT __FILE__ ": " format "\n" , ## arg) 66#define cEVENT(fmt, arg...) \
67do { \
68 if (cifsERROR) \
69 printk(KERN_EVENT "%s: " fmt "\n", __FILE__, ##arg); \
70} while (0)
59 71
60/* error event message: e.g., i/o error */ 72/* error event message: e.g., i/o error */
61#define cifserror(format,arg...) if (cifsERROR) printk(KERN_ERR " CIFS VFS: " format "\n" "" , ## arg) 73#define cifserror(fmt, arg...) \
74do { \
75 if (cifsERROR) \
76 printk(KERN_ERR "CIFS VFS: " fmt "\n", ##arg); \
77} while (0)
62 78
63#define cERROR(button, prspec) if (button) cifserror prspec 79#define cERROR(set, fmt, arg...) \
80do { \
81 if (set) \
82 cifserror(fmt, ##arg); \
83} while (0)
64 84
65/* 85/*
66 * debug OFF 86 * debug OFF
67 * --------- 87 * ---------
68 */ 88 */
69#else /* _CIFS_DEBUG */ 89#else /* _CIFS_DEBUG */
70#define cERROR(button, prspec) 90#define cERROR(set, fmt, arg...)
71#define cEVENT(format, arg...) 91#define cEVENT(fmt, arg...)
72#define cFYI(button, prspec) 92#define cFYI(set, fmt, arg...)
73#define cifserror(format, arg...) 93#define cifserror(fmt, arg...)
74#endif /* _CIFS_DEBUG */ 94#endif /* _CIFS_DEBUG */
75 95
76#endif /* _H_CIFS_DEBUG */ 96#endif /* _H_CIFS_DEBUG */
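
The do { } while (0) wrappers introduced in this hunk are the standard guard for multi-statement macros: without them, the macro body splits apart under an unbraced if/else. A small demonstration (DEMO_LOG is hypothetical, written in the same ##arg variadic style as the macros above):

#include <stdio.h>

/* safe: the whole body is one statement, so the else stays attached */
#define DEMO_LOG(fmt, arg...) \
do { \
        printf(fmt "\n", ##arg); \
        fflush(stdout); \
} while (0)

int main(void)
{
        int err = 1;

        if (err)
                DEMO_LOG("error %d", err);
        else
                printf("ok\n");
        /*
         * Had DEMO_LOG expanded to two bare statements, the fflush()
         * would fall outside the if and the else would fail to compile.
         */
        return 0;
}
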
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index b44ce0a0711c..ac19a6f3dae0 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -15,6 +15,7 @@
15#include <linux/dcache.h> 15#include <linux/dcache.h>
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/slab.h>
18#include <linux/vfs.h> 19#include <linux/vfs.h>
19#include <linux/fs.h> 20#include <linux/fs.h>
20#include "cifsglob.h" 21#include "cifsglob.h"
@@ -54,7 +55,7 @@ void cifs_dfs_release_automount_timer(void)
54 * Extracts sharename form full UNC. 55 * Extracts sharename form full UNC.
55 * i.e. strips from UNC trailing path that is not part of share 56 * i.e. strips from UNC trailing path that is not part of share
56 * name and fixup missing '\' in the begining of DFS node refferal 57 * name and fixup missing '\' in the begining of DFS node refferal
57 * if neccessary. 58 * if necessary.
58 * Returns pointer to share name on success or ERR_PTR on error. 59 * Returns pointer to share name on success or ERR_PTR on error.
59 * Caller is responsible for freeing returned string. 60 * Caller is responsible for freeing returned string.
60 */ 61 */
@@ -84,8 +85,8 @@ static char *cifs_get_share_name(const char *node_name)
84 /* find server name end */ 85 /* find server name end */
85 pSep = memchr(UNC+2, '\\', len-2); 86 pSep = memchr(UNC+2, '\\', len-2);
86 if (!pSep) { 87 if (!pSep) {
87 cERROR(1, ("%s: no server name end in node name: %s", 88 cERROR(1, "%s: no server name end in node name: %s",
88 __func__, node_name)); 89 __func__, node_name);
89 kfree(UNC); 90 kfree(UNC);
90 return ERR_PTR(-EINVAL); 91 return ERR_PTR(-EINVAL);
91 } 92 }
@@ -141,8 +142,8 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
141 142
142 rc = dns_resolve_server_name_to_ip(*devname, &srvIP); 143 rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
143 if (rc != 0) { 144 if (rc != 0) {
144 cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d", 145 cERROR(1, "%s: Failed to resolve server part of %s to IP: %d",
145 __func__, *devname, rc)); 146 __func__, *devname, rc);
146 goto compose_mount_options_err; 147 goto compose_mount_options_err;
147 } 148 }
148 /* md_len = strlen(...) + 12 for 'sep+prefixpath=' 149 /* md_len = strlen(...) + 12 for 'sep+prefixpath='
@@ -216,8 +217,8 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
216 strcat(mountdata, fullpath + ref->path_consumed); 217 strcat(mountdata, fullpath + ref->path_consumed);
217 } 218 }
218 219
219 /*cFYI(1,("%s: parent mountdata: %s", __func__,sb_mountdata));*/ 220 /*cFYI(1, "%s: parent mountdata: %s", __func__,sb_mountdata);*/
220 /*cFYI(1, ("%s: submount mountdata: %s", __func__, mountdata ));*/ 221 /*cFYI(1, "%s: submount mountdata: %s", __func__, mountdata );*/
221 222
222compose_mount_options_out: 223compose_mount_options_out:
223 kfree(srvIP); 224 kfree(srvIP);
@@ -293,11 +294,11 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
293 294
294static void dump_referral(const struct dfs_info3_param *ref) 295static void dump_referral(const struct dfs_info3_param *ref)
295{ 296{
296 cFYI(1, ("DFS: ref path: %s", ref->path_name)); 297 cFYI(1, "DFS: ref path: %s", ref->path_name);
297 cFYI(1, ("DFS: node path: %s", ref->node_name)); 298 cFYI(1, "DFS: node path: %s", ref->node_name);
298 cFYI(1, ("DFS: fl: %hd, srv_type: %hd", ref->flags, ref->server_type)); 299 cFYI(1, "DFS: fl: %hd, srv_type: %hd", ref->flags, ref->server_type);
299 cFYI(1, ("DFS: ref_flags: %hd, path_consumed: %hd", ref->ref_flag, 300 cFYI(1, "DFS: ref_flags: %hd, path_consumed: %hd", ref->ref_flag,
300 ref->path_consumed)); 301 ref->path_consumed);
301} 302}
302 303
303 304
@@ -313,7 +314,7 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
313 int rc = 0; 314 int rc = 0;
314 struct vfsmount *mnt = ERR_PTR(-ENOENT); 315 struct vfsmount *mnt = ERR_PTR(-ENOENT);
315 316
316 cFYI(1, ("in %s", __func__)); 317 cFYI(1, "in %s", __func__);
317 BUG_ON(IS_ROOT(dentry)); 318 BUG_ON(IS_ROOT(dentry));
318 319
319 xid = GetXid(); 320 xid = GetXid();
@@ -351,15 +352,15 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
351 /* connect to a node */ 352 /* connect to a node */
352 len = strlen(referrals[i].node_name); 353 len = strlen(referrals[i].node_name);
353 if (len < 2) { 354 if (len < 2) {
354 cERROR(1, ("%s: Net Address path too short: %s", 355 cERROR(1, "%s: Net Address path too short: %s",
355 __func__, referrals[i].node_name)); 356 __func__, referrals[i].node_name);
356 rc = -EINVAL; 357 rc = -EINVAL;
357 goto out_err; 358 goto out_err;
358 } 359 }
359 mnt = cifs_dfs_do_refmount(nd->path.mnt, 360 mnt = cifs_dfs_do_refmount(nd->path.mnt,
360 nd->path.dentry, referrals + i); 361 nd->path.dentry, referrals + i);
361 cFYI(1, ("%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__, 362 cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__,
362 referrals[i].node_name, mnt)); 363 referrals[i].node_name, mnt);
363 364
364 /* complete mount procedure if we accured submount */ 365 /* complete mount procedure if we accured submount */
365 if (!IS_ERR(mnt)) 366 if (!IS_ERR(mnt))
@@ -377,7 +378,7 @@ out:
377 FreeXid(xid); 378 FreeXid(xid);
378 free_dfs_info_array(referrals, num_referrals); 379 free_dfs_info_array(referrals, num_referrals);
379 kfree(full_path); 380 kfree(full_path);
380 cFYI(1, ("leaving %s" , __func__)); 381 cFYI(1, "leaving %s" , __func__);
381 return ERR_PTR(rc); 382 return ERR_PTR(rc);
382out_err: 383out_err:
383 path_put(&nd->path); 384 path_put(&nd->path);
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 4797787c6a44..246a167cb913 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -18,6 +18,8 @@
18#ifndef _CIFS_FS_SB_H 18#ifndef _CIFS_FS_SB_H
19#define _CIFS_FS_SB_H 19#define _CIFS_FS_SB_H
20 20
21#include <linux/backing-dev.h>
22
21#define CIFS_MOUNT_NO_PERM 1 /* do not do client vfs_perm check */ 23#define CIFS_MOUNT_NO_PERM 1 /* do not do client vfs_perm check */
22#define CIFS_MOUNT_SET_UID 2 /* set current's euid in create etc. */ 24#define CIFS_MOUNT_SET_UID 2 /* set current's euid in create etc. */
23#define CIFS_MOUNT_SERVER_INUM 4 /* inode numbers from uniqueid from server */ 25#define CIFS_MOUNT_SERVER_INUM 4 /* inode numbers from uniqueid from server */
@@ -50,5 +52,6 @@ struct cifs_sb_info {
50#ifdef CONFIG_CIFS_DFS_UPCALL 52#ifdef CONFIG_CIFS_DFS_UPCALL
51 char *mountdata; /* mount options received at mount time */ 53 char *mountdata; /* mount options received at mount time */
52#endif 54#endif
55 struct backing_dev_info bdi;
53}; 56};
54#endif /* _CIFS_FS_SB_H */ 57#endif /* _CIFS_FS_SB_H */
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 8ec7736ce954..379bd7d9c05f 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -20,6 +20,7 @@
20 */ 20 */
21 21
22#include <linux/list.h> 22#include <linux/list.h>
23#include <linux/slab.h>
23#include <linux/string.h> 24#include <linux/string.h>
24#include <keys/user-type.h> 25#include <keys/user-type.h>
25#include <linux/key-type.h> 26#include <linux/key-type.h>
@@ -132,9 +133,9 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
132 dp = description + strlen(description); 133 dp = description + strlen(description);
133 134
134 /* for now, only sec=krb5 and sec=mskrb5 are valid */ 135 /* for now, only sec=krb5 and sec=mskrb5 are valid */
135 if (server->secType == Kerberos) 136 if (server->sec_kerberos)
136 sprintf(dp, ";sec=krb5"); 137 sprintf(dp, ";sec=krb5");
137 else if (server->secType == MSKerberos) 138 else if (server->sec_mskerberos)
138 sprintf(dp, ";sec=mskrb5"); 139 sprintf(dp, ";sec=mskrb5");
139 else 140 else
140 goto out; 141 goto out;
@@ -148,7 +149,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
148 dp = description + strlen(description); 149 dp = description + strlen(description);
149 sprintf(dp, ";pid=0x%x", current->pid); 150 sprintf(dp, ";pid=0x%x", current->pid);
150 151
151 cFYI(1, ("key description = %s", description)); 152 cFYI(1, "key description = %s", description);
152 spnego_key = request_key(&cifs_spnego_key_type, description, ""); 153 spnego_key = request_key(&cifs_spnego_key_type, description, "");
153 154
154#ifdef CONFIG_CIFS_DEBUG2 155#ifdef CONFIG_CIFS_DEBUG2
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 714a542cbafc..430f510a1720 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -19,6 +19,7 @@
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 20 */
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/slab.h>
22#include "cifs_unicode.h" 23#include "cifs_unicode.h"
23#include "cifs_uniupr.h" 24#include "cifs_uniupr.h"
24#include "cifspdu.h" 25#include "cifspdu.h"
@@ -199,9 +200,8 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
199 /* works for 2.4.0 kernel or later */ 200 /* works for 2.4.0 kernel or later */
200 charlen = codepage->char2uni(from, len, &wchar_to[i]); 201 charlen = codepage->char2uni(from, len, &wchar_to[i]);
201 if (charlen < 1) { 202 if (charlen < 1) {
202 cERROR(1, 203 cERROR(1, "strtoUCS: char2uni of %d returned %d",
203 ("strtoUCS: char2uni of %d returned %d", 204 (int)*from, charlen);
204 (int)*from, charlen));
205 /* A question mark */ 205 /* A question mark */
206 to[i] = cpu_to_le16(0x003f); 206 to[i] = cpu_to_le16(0x003f);
207 charlen = 1; 207 charlen = 1;
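The string-format changes in this file and the ones that follow are all the same mechanical conversion: cFYI()/cERROR() used to take their printf arguments as one extra-parenthesized group, and now take a real variadic list. The new definitions live in cifs_debug.h, which is outside this excerpt, so the exact prefixes below are an assumption; the shape is presumably something like:

	/* sketch: variadic form of the debug macros (cifs_debug.h hunk not
	 * shown here; format prefixes are guesses) */
	#define cFYI(set, fmt, ...)					\
	do {								\
		if ((set) && cifsFYI)					\
			printk(KERN_DEBUG "CIFS: " fmt "\n", ##__VA_ARGS__); \
	} while (0)

	#define cERROR(set, fmt, ...)					\
	do {								\
		if ((set) && cifsERROR)					\
			printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__); \
	} while (0)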
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 7dfe0842a6f6..85d7cf7ff2c8 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -22,6 +22,7 @@
22 */ 22 */
23 23
24#include <linux/fs.h> 24#include <linux/fs.h>
25#include <linux/slab.h>
25#include "cifspdu.h" 26#include "cifspdu.h"
26#include "cifsglob.h" 27#include "cifsglob.h"
27#include "cifsacl.h" 28#include "cifsacl.h"
@@ -86,11 +87,11 @@ int match_sid(struct cifs_sid *ctsid)
86 continue; /* all sub_auth values do not match */ 87 continue; /* all sub_auth values do not match */
87 } 88 }
88 89
89 cFYI(1, ("matching sid: %s\n", wksidarr[i].sidname)); 90 cFYI(1, "matching sid: %s\n", wksidarr[i].sidname);
90 return 0; /* sids compare/match */ 91 return 0; /* sids compare/match */
91 } 92 }
92 93
93 cFYI(1, ("No matching sid")); 94 cFYI(1, "No matching sid");
94 return -1; 95 return -1;
95} 96}
96 97
@@ -207,14 +208,14 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
207 *pbits_to_set &= ~S_IXUGO; 208 *pbits_to_set &= ~S_IXUGO;
208 return; 209 return;
209 } else if (type != ACCESS_ALLOWED) { 210 } else if (type != ACCESS_ALLOWED) {
210 cERROR(1, ("unknown access control type %d", type)); 211 cERROR(1, "unknown access control type %d", type);
211 return; 212 return;
212 } 213 }
213 /* else ACCESS_ALLOWED type */ 214 /* else ACCESS_ALLOWED type */
214 215
215 if (flags & GENERIC_ALL) { 216 if (flags & GENERIC_ALL) {
216 *pmode |= (S_IRWXUGO & (*pbits_to_set)); 217 *pmode |= (S_IRWXUGO & (*pbits_to_set));
217 cFYI(DBG2, ("all perms")); 218 cFYI(DBG2, "all perms");
218 return; 219 return;
219 } 220 }
220 if ((flags & GENERIC_WRITE) || 221 if ((flags & GENERIC_WRITE) ||
@@ -227,7 +228,7 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
227 ((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS)) 228 ((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS))
228 *pmode |= (S_IXUGO & (*pbits_to_set)); 229 *pmode |= (S_IXUGO & (*pbits_to_set));
229 230
230 cFYI(DBG2, ("access flags 0x%x mode now 0x%x", flags, *pmode)); 231 cFYI(DBG2, "access flags 0x%x mode now 0x%x", flags, *pmode);
231 return; 232 return;
232} 233}
233 234
@@ -256,7 +257,7 @@ static void mode_to_access_flags(umode_t mode, umode_t bits_to_use,
256 if (mode & S_IXUGO) 257 if (mode & S_IXUGO)
257 *pace_flags |= SET_FILE_EXEC_RIGHTS; 258 *pace_flags |= SET_FILE_EXEC_RIGHTS;
258 259
259 cFYI(DBG2, ("mode: 0x%x, access flags now 0x%x", mode, *pace_flags)); 260 cFYI(DBG2, "mode: 0x%x, access flags now 0x%x", mode, *pace_flags);
260 return; 261 return;
261} 262}
262 263
@@ -296,24 +297,24 @@ static void dump_ace(struct cifs_ace *pace, char *end_of_acl)
296 /* validate that we do not go past end of acl */ 297 /* validate that we do not go past end of acl */
297 298
298 if (le16_to_cpu(pace->size) < 16) { 299 if (le16_to_cpu(pace->size) < 16) {
299 cERROR(1, ("ACE too small, %d", le16_to_cpu(pace->size))); 300 cERROR(1, "ACE too small %d", le16_to_cpu(pace->size));
300 return; 301 return;
301 } 302 }
302 303
303 if (end_of_acl < (char *)pace + le16_to_cpu(pace->size)) { 304 if (end_of_acl < (char *)pace + le16_to_cpu(pace->size)) {
304 cERROR(1, ("ACL too small to parse ACE")); 305 cERROR(1, "ACL too small to parse ACE");
305 return; 306 return;
306 } 307 }
307 308
308 num_subauth = pace->sid.num_subauth; 309 num_subauth = pace->sid.num_subauth;
309 if (num_subauth) { 310 if (num_subauth) {
310 int i; 311 int i;
311 cFYI(1, ("ACE revision %d num_auth %d type %d flags %d size %d", 312 cFYI(1, "ACE revision %d num_auth %d type %d flags %d size %d",
312 pace->sid.revision, pace->sid.num_subauth, pace->type, 313 pace->sid.revision, pace->sid.num_subauth, pace->type,
313 pace->flags, le16_to_cpu(pace->size))); 314 pace->flags, le16_to_cpu(pace->size));
314 for (i = 0; i < num_subauth; ++i) { 315 for (i = 0; i < num_subauth; ++i) {
315 cFYI(1, ("ACE sub_auth[%d]: 0x%x", i, 316 cFYI(1, "ACE sub_auth[%d]: 0x%x", i,
316 le32_to_cpu(pace->sid.sub_auth[i]))); 317 le32_to_cpu(pace->sid.sub_auth[i]));
317 } 318 }
318 319
319 /* BB add length check to make sure that we do not have huge 320 /* BB add length check to make sure that we do not have huge
@@ -346,13 +347,13 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
346 347
347 /* validate that we do not go past end of acl */ 348 /* validate that we do not go past end of acl */
348 if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) { 349 if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) {
349 cERROR(1, ("ACL too small to parse DACL")); 350 cERROR(1, "ACL too small to parse DACL");
350 return; 351 return;
351 } 352 }
352 353
353 cFYI(DBG2, ("DACL revision %d size %d num aces %d", 354 cFYI(DBG2, "DACL revision %d size %d num aces %d",
354 le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size), 355 le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size),
355 le32_to_cpu(pdacl->num_aces))); 356 le32_to_cpu(pdacl->num_aces));
356 357
357 /* reset rwx permissions for user/group/other. 358 /* reset rwx permissions for user/group/other.
358 Also, if num_aces is 0 i.e. DACL has no ACEs, 359 Also, if num_aces is 0 i.e. DACL has no ACEs,
@@ -436,25 +437,25 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
436 /* validate that we do not go past end of ACL - sid must be at least 8 437 /* validate that we do not go past end of ACL - sid must be at least 8
437 bytes long (assuming no sub-auths - e.g. the null SID */ 438 bytes long (assuming no sub-auths - e.g. the null SID */
438 if (end_of_acl < (char *)psid + 8) { 439 if (end_of_acl < (char *)psid + 8) {
439 cERROR(1, ("ACL too small to parse SID %p", psid)); 440 cERROR(1, "ACL too small to parse SID %p", psid);
440 return -EINVAL; 441 return -EINVAL;
441 } 442 }
442 443
443 if (psid->num_subauth) { 444 if (psid->num_subauth) {
444#ifdef CONFIG_CIFS_DEBUG2 445#ifdef CONFIG_CIFS_DEBUG2
445 int i; 446 int i;
446 cFYI(1, ("SID revision %d num_auth %d", 447 cFYI(1, "SID revision %d num_auth %d",
447 psid->revision, psid->num_subauth)); 448 psid->revision, psid->num_subauth);
448 449
449 for (i = 0; i < psid->num_subauth; i++) { 450 for (i = 0; i < psid->num_subauth; i++) {
450 cFYI(1, ("SID sub_auth[%d]: 0x%x ", i, 451 cFYI(1, "SID sub_auth[%d]: 0x%x ", i,
451 le32_to_cpu(psid->sub_auth[i]))); 452 le32_to_cpu(psid->sub_auth[i]));
452 } 453 }
453 454
454 /* BB add length check to make sure that we do not have huge 455 /* BB add length check to make sure that we do not have huge
455 num auths and therefore go off the end */ 456 num auths and therefore go off the end */
456 cFYI(1, ("RID 0x%x", 457 cFYI(1, "RID 0x%x",
457 le32_to_cpu(psid->sub_auth[psid->num_subauth-1]))); 458 le32_to_cpu(psid->sub_auth[psid->num_subauth-1]));
458#endif 459#endif
459 } 460 }
460 461
@@ -481,11 +482,11 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
481 le32_to_cpu(pntsd->gsidoffset)); 482 le32_to_cpu(pntsd->gsidoffset));
482 dacloffset = le32_to_cpu(pntsd->dacloffset); 483 dacloffset = le32_to_cpu(pntsd->dacloffset);
483 dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); 484 dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
484 cFYI(DBG2, ("revision %d type 0x%x ooffset 0x%x goffset 0x%x " 485 cFYI(DBG2, "revision %d type 0x%x ooffset 0x%x goffset 0x%x "
485 "sacloffset 0x%x dacloffset 0x%x", 486 "sacloffset 0x%x dacloffset 0x%x",
486 pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset), 487 pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset),
487 le32_to_cpu(pntsd->gsidoffset), 488 le32_to_cpu(pntsd->gsidoffset),
488 le32_to_cpu(pntsd->sacloffset), dacloffset)); 489 le32_to_cpu(pntsd->sacloffset), dacloffset);
489/* cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */ 490/* cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */
490 rc = parse_sid(owner_sid_ptr, end_of_acl); 491 rc = parse_sid(owner_sid_ptr, end_of_acl);
491 if (rc) 492 if (rc)
@@ -499,7 +500,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
499 parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr, 500 parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr,
500 group_sid_ptr, fattr); 501 group_sid_ptr, fattr);
501 else 502 else
502 cFYI(1, ("no ACL")); /* BB grant all or default perms? */ 503 cFYI(1, "no ACL"); /* BB grant all or default perms? */
503 504
504/* cifscred->uid = owner_sid_ptr->rid; 505/* cifscred->uid = owner_sid_ptr->rid;
505 cifscred->gid = group_sid_ptr->rid; 506 cifscred->gid = group_sid_ptr->rid;
@@ -562,7 +563,7 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
562 FreeXid(xid); 563 FreeXid(xid);
563 564
564 565
565 cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen)); 566 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
566 return pntsd; 567 return pntsd;
567} 568}
568 569
@@ -580,12 +581,12 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
580 &fid, &oplock, NULL, cifs_sb->local_nls, 581 &fid, &oplock, NULL, cifs_sb->local_nls,
581 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 582 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
582 if (rc) { 583 if (rc) {
583 cERROR(1, ("Unable to open file to get ACL")); 584 cERROR(1, "Unable to open file to get ACL");
584 goto out; 585 goto out;
585 } 586 }
586 587
587 rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen); 588 rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
588 cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen)); 589 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
589 590
590 CIFSSMBClose(xid, cifs_sb->tcon, fid); 591 CIFSSMBClose(xid, cifs_sb->tcon, fid);
591 out: 592 out:
@@ -620,7 +621,7 @@ static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid,
620 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); 621 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
621 FreeXid(xid); 622 FreeXid(xid);
622 623
623 cFYI(DBG2, ("SetCIFSACL rc = %d", rc)); 624 cFYI(DBG2, "SetCIFSACL rc = %d", rc);
624 return rc; 625 return rc;
625} 626}
626 627
@@ -637,12 +638,12 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
637 &fid, &oplock, NULL, cifs_sb->local_nls, 638 &fid, &oplock, NULL, cifs_sb->local_nls,
638 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 639 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
639 if (rc) { 640 if (rc) {
640 cERROR(1, ("Unable to open file to set ACL")); 641 cERROR(1, "Unable to open file to set ACL");
641 goto out; 642 goto out;
642 } 643 }
643 644
644 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); 645 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
645 cFYI(DBG2, ("SetCIFSACL rc = %d", rc)); 646 cFYI(DBG2, "SetCIFSACL rc = %d", rc);
646 647
647 CIFSSMBClose(xid, cifs_sb->tcon, fid); 648 CIFSSMBClose(xid, cifs_sb->tcon, fid);
648 out: 649 out:
@@ -658,7 +659,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
658 struct cifsFileInfo *open_file; 659 struct cifsFileInfo *open_file;
659 int rc; 660 int rc;
660 661
661 cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode)); 662 cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode);
662 663
663 open_file = find_readable_file(CIFS_I(inode)); 664 open_file = find_readable_file(CIFS_I(inode));
664 if (!open_file) 665 if (!open_file)
@@ -678,7 +679,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
678 u32 acllen = 0; 679 u32 acllen = 0;
679 int rc = 0; 680 int rc = 0;
680 681
681 cFYI(DBG2, ("converting ACL to mode for %s", path)); 682 cFYI(DBG2, "converting ACL to mode for %s", path);
682 683
683 if (pfid) 684 if (pfid)
684 pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen); 685 pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen);
@@ -689,7 +690,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
689 if (pntsd) 690 if (pntsd)
690 rc = parse_sec_desc(pntsd, acllen, fattr); 691 rc = parse_sec_desc(pntsd, acllen, fattr);
691 if (rc) 692 if (rc)
692 cFYI(1, ("parse sec desc failed rc = %d", rc)); 693 cFYI(1, "parse sec desc failed rc = %d", rc);
693 694
694 kfree(pntsd); 695 kfree(pntsd);
695 return; 696 return;
@@ -703,7 +704,7 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
703 struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */ 704 struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */
704 struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ 705 struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */
705 706
706 cFYI(DBG2, ("set ACL from mode for %s", path)); 707 cFYI(DBG2, "set ACL from mode for %s", path);
707 708
708 /* Get the security descriptor */ 709 /* Get the security descriptor */
709 pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen); 710 pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen);
@@ -720,19 +721,19 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
720 DEFSECDESCLEN : secdesclen; 721 DEFSECDESCLEN : secdesclen;
721 pnntsd = kmalloc(secdesclen, GFP_KERNEL); 722 pnntsd = kmalloc(secdesclen, GFP_KERNEL);
722 if (!pnntsd) { 723 if (!pnntsd) {
723 cERROR(1, ("Unable to allocate security descriptor")); 724 cERROR(1, "Unable to allocate security descriptor");
724 kfree(pntsd); 725 kfree(pntsd);
725 return -ENOMEM; 726 return -ENOMEM;
726 } 727 }
727 728
728 rc = build_sec_desc(pntsd, pnntsd, inode, nmode); 729 rc = build_sec_desc(pntsd, pnntsd, inode, nmode);
729 730
730 cFYI(DBG2, ("build_sec_desc rc: %d", rc)); 731 cFYI(DBG2, "build_sec_desc rc: %d", rc);
731 732
732 if (!rc) { 733 if (!rc) {
733 /* Set the security descriptor */ 734 /* Set the security descriptor */
734 rc = set_cifs_acl(pnntsd, secdesclen, inode, path); 735 rc = set_cifs_acl(pnntsd, secdesclen, inode, path);
735 cFYI(DBG2, ("set_cifs_acl rc: %d", rc)); 736 cFYI(DBG2, "set_cifs_acl rc: %d", rc);
736 } 737 }
737 738
738 kfree(pnntsd); 739 kfree(pnntsd);
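Beyond the message-format conversion, this file is a good illustration of the defensive pattern used when walking a server-supplied security descriptor: every wire-supplied size field is checked against end_of_acl before anything is dereferenced. Condensed from dump_ace() above:

	/* pattern as in dump_ace()/parse_dacl()/parse_sid(): never walk
	 * past the buffer on the strength of a size field from the wire */
	if (le16_to_cpu(pace->size) < 16) {
		cERROR(1, "ACE too small %d", le16_to_cpu(pace->size));
		return;
	}
	if (end_of_acl < (char *)pace + le16_to_cpu(pace->size)) {
		cERROR(1, "ACL too small to parse ACE");
		return;
	}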
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 7efe1745494d..847628dfdc44 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -20,6 +20,7 @@
20 */ 20 */
21 21
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/slab.h>
23#include "cifspdu.h" 24#include "cifspdu.h"
24#include "cifsglob.h" 25#include "cifsglob.h"
25#include "cifs_debug.h" 26#include "cifs_debug.h"
@@ -102,7 +103,7 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
102 if (iov[i].iov_len == 0) 103 if (iov[i].iov_len == 0)
103 continue; 104 continue;
104 if (iov[i].iov_base == NULL) { 105 if (iov[i].iov_base == NULL) {
105 cERROR(1, ("null iovec entry")); 106 cERROR(1, "null iovec entry");
106 return -EIO; 107 return -EIO;
107 } 108 }
108 /* The first entry includes a length field (which does not get 109 /* The first entry includes a length field (which does not get
@@ -180,8 +181,8 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
180 181
181 /* Do not need to verify session setups with signature "BSRSPYL " */ 182 /* Do not need to verify session setups with signature "BSRSPYL " */
182 if (memcmp(cifs_pdu->Signature.SecuritySignature, "BSRSPYL ", 8) == 0) 183 if (memcmp(cifs_pdu->Signature.SecuritySignature, "BSRSPYL ", 8) == 0)
183 cFYI(1, ("dummy signature received for smb command 0x%x", 184 cFYI(1, "dummy signature received for smb command 0x%x",
184 cifs_pdu->Command)); 185 cifs_pdu->Command);
185 186
186 /* save off the original signature so we can modify the smb and check 187 /* save off the original signature so we can modify the smb and check
187 its signature against what the server sent */ 188 its signature against what the server sent */
@@ -290,7 +291,7 @@ void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
290 if (password) 291 if (password)
291 strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE); 292 strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE);
292 293
293 if (!encrypt && extended_security & CIFSSEC_MAY_PLNTXT) { 294 if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) {
294 memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE); 295 memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE);
295 memcpy(lnm_session_key, password_with_pad, 296 memcpy(lnm_session_key, password_with_pad,
296 CIFS_ENCPWD_SIZE); 297 CIFS_ENCPWD_SIZE);
@@ -397,7 +398,7 @@ void setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
397 /* calculate buf->ntlmv2_hash */ 398 /* calculate buf->ntlmv2_hash */
398 rc = calc_ntlmv2_hash(ses, nls_cp); 399 rc = calc_ntlmv2_hash(ses, nls_cp);
399 if (rc) 400 if (rc)
400 cERROR(1, ("could not get v2 hash rc %d", rc)); 401 cERROR(1, "could not get v2 hash rc %d", rc);
401 CalcNTLMv2_response(ses, resp_buf); 402 CalcNTLMv2_response(ses, resp_buf);
402 403
403 /* now calculate the MAC key for NTLMv2 */ 404 /* now calculate the MAC key for NTLMv2 */
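The extended_security global is renamed global_secflags here and in cifsfs.c/cifsglob.h below; the new name matches what the variable actually holds, the full CIFSSEC_* policy bitmask rather than a single extended-security toggle. Usage stays bit-test style, for example this hypothetical condensation of the calc_lanman_hash() gate above:

	/* sketch: plaintext LM session keys only when policy allows */
	static bool lanman_plaintext_allowed(void)
	{
		return (global_secflags & CIFSSEC_MAY_PLNTXT) != 0;
	}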
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8c6a03627176..78c02eb4cb1f 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -49,10 +49,6 @@
49#include "cifs_spnego.h" 49#include "cifs_spnego.h"
50#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ 50#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */
51 51
52#ifdef CONFIG_CIFS_QUOTA
53static const struct quotactl_ops cifs_quotactl_ops;
54#endif /* QUOTA */
55
56int cifsFYI = 0; 52int cifsFYI = 0;
57int cifsERROR = 1; 53int cifsERROR = 1;
58int traceSMB = 0; 54int traceSMB = 0;
@@ -61,7 +57,7 @@ unsigned int experimEnabled = 0;
61unsigned int linuxExtEnabled = 1; 57unsigned int linuxExtEnabled = 1;
62unsigned int lookupCacheEnabled = 1; 58unsigned int lookupCacheEnabled = 1;
63unsigned int multiuser_mount = 0; 59unsigned int multiuser_mount = 0;
64unsigned int extended_security = CIFSSEC_DEF; 60unsigned int global_secflags = CIFSSEC_DEF;
65/* unsigned int ntlmv2_support = 0; */ 61/* unsigned int ntlmv2_support = 0; */
66unsigned int sign_CIFS_PDUs = 1; 62unsigned int sign_CIFS_PDUs = 1;
67static const struct super_operations cifs_super_ops; 63static const struct super_operations cifs_super_ops;
@@ -86,8 +82,6 @@ extern mempool_t *cifs_sm_req_poolp;
86extern mempool_t *cifs_req_poolp; 82extern mempool_t *cifs_req_poolp;
87extern mempool_t *cifs_mid_poolp; 83extern mempool_t *cifs_mid_poolp;
88 84
89extern struct kmem_cache *cifs_oplock_cachep;
90
91static int 85static int
92cifs_read_super(struct super_block *sb, void *data, 86cifs_read_super(struct super_block *sb, void *data,
93 const char *devname, int silent) 87 const char *devname, int silent)
@@ -103,6 +97,12 @@ cifs_read_super(struct super_block *sb, void *data,
103 if (cifs_sb == NULL) 97 if (cifs_sb == NULL)
104 return -ENOMEM; 98 return -ENOMEM;
105 99
100 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
101 if (rc) {
102 kfree(cifs_sb);
103 return rc;
104 }
105
106#ifdef CONFIG_CIFS_DFS_UPCALL 106#ifdef CONFIG_CIFS_DFS_UPCALL
107 /* copy mount params to sb for use in submounts */ 107 /* copy mount params to sb for use in submounts */
108 /* BB: should we move this after the mount so we 108 /* BB: should we move this after the mount so we
@@ -115,6 +115,7 @@ cifs_read_super(struct super_block *sb, void *data,
115 int len = strlen(data); 115 int len = strlen(data);
116 cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL); 116 cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL);
117 if (cifs_sb->mountdata == NULL) { 117 if (cifs_sb->mountdata == NULL) {
118 bdi_destroy(&cifs_sb->bdi);
118 kfree(sb->s_fs_info); 119 kfree(sb->s_fs_info);
119 sb->s_fs_info = NULL; 120 sb->s_fs_info = NULL;
120 return -ENOMEM; 121 return -ENOMEM;
@@ -128,19 +129,16 @@ cifs_read_super(struct super_block *sb, void *data,
128 129
129 if (rc) { 130 if (rc) {
130 if (!silent) 131 if (!silent)
131 cERROR(1, 132 cERROR(1, "cifs_mount failed w/return code = %d", rc);
132 ("cifs_mount failed w/return code = %d", rc));
133 goto out_mount_failed; 133 goto out_mount_failed;
134 } 134 }
135 135
136 sb->s_magic = CIFS_MAGIC_NUMBER; 136 sb->s_magic = CIFS_MAGIC_NUMBER;
137 sb->s_op = &cifs_super_ops; 137 sb->s_op = &cifs_super_ops;
138 sb->s_bdi = &cifs_sb->bdi;
138/* if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512) 139/* if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512)
139 sb->s_blocksize = 140 sb->s_blocksize =
140 cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */ 141 cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */
141#ifdef CONFIG_CIFS_QUOTA
142 sb->s_qcop = &cifs_quotactl_ops;
143#endif
144 sb->s_blocksize = CIFS_MAX_MSGSIZE; 142 sb->s_blocksize = CIFS_MAX_MSGSIZE;
145 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ 143 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */
146 inode = cifs_root_iget(sb, ROOT_I); 144 inode = cifs_root_iget(sb, ROOT_I);
@@ -160,7 +158,7 @@ cifs_read_super(struct super_block *sb, void *data,
160 158
161#ifdef CONFIG_CIFS_EXPERIMENTAL 159#ifdef CONFIG_CIFS_EXPERIMENTAL
162 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { 160 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
163 cFYI(1, ("export ops supported")); 161 cFYI(1, "export ops supported");
164 sb->s_export_op = &cifs_export_ops; 162 sb->s_export_op = &cifs_export_ops;
165 } 163 }
166#endif /* EXPERIMENTAL */ 164#endif /* EXPERIMENTAL */
@@ -168,7 +166,7 @@ cifs_read_super(struct super_block *sb, void *data,
168 return 0; 166 return 0;
169 167
170out_no_root: 168out_no_root:
171 cERROR(1, ("cifs_read_super: get root inode failed")); 169 cERROR(1, "cifs_read_super: get root inode failed");
172 if (inode) 170 if (inode)
173 iput(inode); 171 iput(inode);
174 172
@@ -183,6 +181,7 @@ out_mount_failed:
183 } 181 }
184#endif 182#endif
185 unload_nls(cifs_sb->local_nls); 183 unload_nls(cifs_sb->local_nls);
184 bdi_destroy(&cifs_sb->bdi);
186 kfree(cifs_sb); 185 kfree(cifs_sb);
187 } 186 }
188 return rc; 187 return rc;
@@ -194,10 +193,10 @@ cifs_put_super(struct super_block *sb)
194 int rc = 0; 193 int rc = 0;
195 struct cifs_sb_info *cifs_sb; 194 struct cifs_sb_info *cifs_sb;
196 195
197 cFYI(1, ("In cifs_put_super")); 196 cFYI(1, "In cifs_put_super");
198 cifs_sb = CIFS_SB(sb); 197 cifs_sb = CIFS_SB(sb);
199 if (cifs_sb == NULL) { 198 if (cifs_sb == NULL) {
200 cFYI(1, ("Empty cifs superblock info passed to unmount")); 199 cFYI(1, "Empty cifs superblock info passed to unmount");
201 return; 200 return;
202 } 201 }
203 202
@@ -205,7 +204,7 @@ cifs_put_super(struct super_block *sb)
205 204
206 rc = cifs_umount(sb, cifs_sb); 205 rc = cifs_umount(sb, cifs_sb);
207 if (rc) 206 if (rc)
208 cERROR(1, ("cifs_umount failed with return code %d", rc)); 207 cERROR(1, "cifs_umount failed with return code %d", rc);
209#ifdef CONFIG_CIFS_DFS_UPCALL 208#ifdef CONFIG_CIFS_DFS_UPCALL
210 if (cifs_sb->mountdata) { 209 if (cifs_sb->mountdata) {
211 kfree(cifs_sb->mountdata); 210 kfree(cifs_sb->mountdata);
@@ -214,6 +213,7 @@ cifs_put_super(struct super_block *sb)
214#endif 213#endif
215 214
216 unload_nls(cifs_sb->local_nls); 215 unload_nls(cifs_sb->local_nls);
216 bdi_destroy(&cifs_sb->bdi);
217 kfree(cifs_sb); 217 kfree(cifs_sb);
218 218
219 unlock_kernel(); 219 unlock_kernel();
@@ -290,7 +290,6 @@ static int cifs_permission(struct inode *inode, int mask)
290static struct kmem_cache *cifs_inode_cachep; 290static struct kmem_cache *cifs_inode_cachep;
291static struct kmem_cache *cifs_req_cachep; 291static struct kmem_cache *cifs_req_cachep;
292static struct kmem_cache *cifs_mid_cachep; 292static struct kmem_cache *cifs_mid_cachep;
293struct kmem_cache *cifs_oplock_cachep;
294static struct kmem_cache *cifs_sm_req_cachep; 293static struct kmem_cache *cifs_sm_req_cachep;
295mempool_t *cifs_sm_req_poolp; 294mempool_t *cifs_sm_req_poolp;
296mempool_t *cifs_req_poolp; 295mempool_t *cifs_req_poolp;
@@ -312,6 +311,7 @@ cifs_alloc_inode(struct super_block *sb)
312 cifs_inode->clientCanCacheRead = false; 311 cifs_inode->clientCanCacheRead = false;
313 cifs_inode->clientCanCacheAll = false; 312 cifs_inode->clientCanCacheAll = false;
314 cifs_inode->delete_pending = false; 313 cifs_inode->delete_pending = false;
314 cifs_inode->invalid_mapping = false;
315 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ 315 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
316 cifs_inode->server_eof = 0; 316 cifs_inode->server_eof = 0;
317 317
@@ -421,106 +421,6 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
421 return 0; 421 return 0;
422} 422}
423 423
424#ifdef CONFIG_CIFS_QUOTA
425int cifs_xquota_set(struct super_block *sb, int quota_type, qid_t qid,
426 struct fs_disk_quota *pdquota)
427{
428 int xid;
429 int rc = 0;
430 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
431 struct cifsTconInfo *pTcon;
432
433 if (cifs_sb)
434 pTcon = cifs_sb->tcon;
435 else
436 return -EIO;
437
438
439 xid = GetXid();
440 if (pTcon) {
441 cFYI(1, ("set type: 0x%x id: %d", quota_type, qid));
442 } else
443 rc = -EIO;
444
445 FreeXid(xid);
446 return rc;
447}
448
449int cifs_xquota_get(struct super_block *sb, int quota_type, qid_t qid,
450 struct fs_disk_quota *pdquota)
451{
452 int xid;
453 int rc = 0;
454 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
455 struct cifsTconInfo *pTcon;
456
457 if (cifs_sb)
458 pTcon = cifs_sb->tcon;
459 else
460 return -EIO;
461
462 xid = GetXid();
463 if (pTcon) {
464 cFYI(1, ("set type: 0x%x id: %d", quota_type, qid));
465 } else
466 rc = -EIO;
467
468 FreeXid(xid);
469 return rc;
470}
471
472int cifs_xstate_set(struct super_block *sb, unsigned int flags, int operation)
473{
474 int xid;
475 int rc = 0;
476 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
477 struct cifsTconInfo *pTcon;
478
479 if (cifs_sb)
480 pTcon = cifs_sb->tcon;
481 else
482 return -EIO;
483
484 xid = GetXid();
485 if (pTcon) {
486 cFYI(1, ("flags: 0x%x operation: 0x%x", flags, operation));
487 } else
488 rc = -EIO;
489
490 FreeXid(xid);
491 return rc;
492}
493
494int cifs_xstate_get(struct super_block *sb, struct fs_quota_stat *qstats)
495{
496 int xid;
497 int rc = 0;
498 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
499 struct cifsTconInfo *pTcon;
500
501 if (cifs_sb)
502 pTcon = cifs_sb->tcon;
503 else
504 return -EIO;
505
506 xid = GetXid();
507 if (pTcon) {
508 cFYI(1, ("pqstats %p", qstats));
509 } else
510 rc = -EIO;
511
512 FreeXid(xid);
513 return rc;
514}
515
516static const struct quotactl_ops cifs_quotactl_ops = {
517 .set_xquota = cifs_xquota_set,
518 .get_xquota = cifs_xquota_get,
519 .set_xstate = cifs_xstate_set,
520 .get_xstate = cifs_xstate_get,
521};
522#endif
523
524static void cifs_umount_begin(struct super_block *sb) 424static void cifs_umount_begin(struct super_block *sb)
525{ 425{
526 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 426 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
@@ -547,7 +447,7 @@ static void cifs_umount_begin(struct super_block *sb)
547 /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */ 447 /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
548 /* cancel_notify_requests(tcon); */ 448 /* cancel_notify_requests(tcon); */
549 if (tcon->ses && tcon->ses->server) { 449 if (tcon->ses && tcon->ses->server) {
550 cFYI(1, ("wake up tasks now - umount begin not complete")); 450 cFYI(1, "wake up tasks now - umount begin not complete");
551 wake_up_all(&tcon->ses->server->request_q); 451 wake_up_all(&tcon->ses->server->request_q);
552 wake_up_all(&tcon->ses->server->response_q); 452 wake_up_all(&tcon->ses->server->response_q);
553 msleep(1); /* yield */ 453 msleep(1); /* yield */
@@ -598,7 +498,7 @@ cifs_get_sb(struct file_system_type *fs_type,
598 int rc; 498 int rc;
599 struct super_block *sb = sget(fs_type, NULL, set_anon_super, NULL); 499 struct super_block *sb = sget(fs_type, NULL, set_anon_super, NULL);
600 500
601 cFYI(1, ("Devname: %s flags: %d ", dev_name, flags)); 501 cFYI(1, "Devname: %s flags: %d ", dev_name, flags);
602 502
603 if (IS_ERR(sb)) 503 if (IS_ERR(sb))
604 return PTR_ERR(sb); 504 return PTR_ERR(sb);
@@ -638,14 +538,13 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
638 setting the revalidate time to zero */ 538 setting the revalidate time to zero */
639 CIFS_I(file->f_path.dentry->d_inode)->time = 0; 539 CIFS_I(file->f_path.dentry->d_inode)->time = 0;
640 540
641 retval = cifs_revalidate(file->f_path.dentry); 541 retval = cifs_revalidate_file(file);
642 if (retval < 0) 542 if (retval < 0)
643 return (loff_t)retval; 543 return (loff_t)retval;
644 } 544 }
645 return generic_file_llseek_unlocked(file, offset, origin); 545 return generic_file_llseek_unlocked(file, offset, origin);
646} 546}
647 547
648#ifdef CONFIG_CIFS_EXPERIMENTAL
649static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) 548static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
650{ 549{
651 /* note that this is called by vfs setlease with the BKL held 550 /* note that this is called by vfs setlease with the BKL held
@@ -674,7 +573,6 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
674 else 573 else
675 return -EAGAIN; 574 return -EAGAIN;
676} 575}
677#endif
678 576
679struct file_system_type cifs_fs_type = { 577struct file_system_type cifs_fs_type = {
680 .owner = THIS_MODULE, 578 .owner = THIS_MODULE,
@@ -751,10 +649,7 @@ const struct file_operations cifs_file_ops = {
751#ifdef CONFIG_CIFS_POSIX 649#ifdef CONFIG_CIFS_POSIX
752 .unlocked_ioctl = cifs_ioctl, 650 .unlocked_ioctl = cifs_ioctl,
753#endif /* CONFIG_CIFS_POSIX */ 651#endif /* CONFIG_CIFS_POSIX */
754
755#ifdef CONFIG_CIFS_EXPERIMENTAL
756 .setlease = cifs_setlease, 652 .setlease = cifs_setlease,
757#endif /* CONFIG_CIFS_EXPERIMENTAL */
758}; 653};
759 654
760const struct file_operations cifs_file_direct_ops = { 655const struct file_operations cifs_file_direct_ops = {
@@ -773,9 +668,7 @@ const struct file_operations cifs_file_direct_ops = {
773 .unlocked_ioctl = cifs_ioctl, 668 .unlocked_ioctl = cifs_ioctl,
774#endif /* CONFIG_CIFS_POSIX */ 669#endif /* CONFIG_CIFS_POSIX */
775 .llseek = cifs_llseek, 670 .llseek = cifs_llseek,
776#ifdef CONFIG_CIFS_EXPERIMENTAL
777 .setlease = cifs_setlease, 671 .setlease = cifs_setlease,
778#endif /* CONFIG_CIFS_EXPERIMENTAL */
779}; 672};
780const struct file_operations cifs_file_nobrl_ops = { 673const struct file_operations cifs_file_nobrl_ops = {
781 .read = do_sync_read, 674 .read = do_sync_read,
@@ -792,10 +685,7 @@ const struct file_operations cifs_file_nobrl_ops = {
792#ifdef CONFIG_CIFS_POSIX 685#ifdef CONFIG_CIFS_POSIX
793 .unlocked_ioctl = cifs_ioctl, 686 .unlocked_ioctl = cifs_ioctl,
794#endif /* CONFIG_CIFS_POSIX */ 687#endif /* CONFIG_CIFS_POSIX */
795
796#ifdef CONFIG_CIFS_EXPERIMENTAL
797 .setlease = cifs_setlease, 688 .setlease = cifs_setlease,
798#endif /* CONFIG_CIFS_EXPERIMENTAL */
799}; 689};
800 690
801const struct file_operations cifs_file_direct_nobrl_ops = { 691const struct file_operations cifs_file_direct_nobrl_ops = {
@@ -807,14 +697,13 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
807 .release = cifs_close, 697 .release = cifs_close,
808 .fsync = cifs_fsync, 698 .fsync = cifs_fsync,
809 .flush = cifs_flush, 699 .flush = cifs_flush,
700 .mmap = cifs_file_mmap,
810 .splice_read = generic_file_splice_read, 701 .splice_read = generic_file_splice_read,
811#ifdef CONFIG_CIFS_POSIX 702#ifdef CONFIG_CIFS_POSIX
812 .unlocked_ioctl = cifs_ioctl, 703 .unlocked_ioctl = cifs_ioctl,
813#endif /* CONFIG_CIFS_POSIX */ 704#endif /* CONFIG_CIFS_POSIX */
814 .llseek = cifs_llseek, 705 .llseek = cifs_llseek,
815#ifdef CONFIG_CIFS_EXPERIMENTAL
816 .setlease = cifs_setlease, 706 .setlease = cifs_setlease,
817#endif /* CONFIG_CIFS_EXPERIMENTAL */
818}; 707};
819 708
820const struct file_operations cifs_dir_ops = { 709const struct file_operations cifs_dir_ops = {
@@ -866,7 +755,7 @@ cifs_init_request_bufs(void)
866 } else { 755 } else {
867 CIFSMaxBufSize &= 0x1FE00; /* Round size to even 512 byte mult*/ 756 CIFSMaxBufSize &= 0x1FE00; /* Round size to even 512 byte mult*/
868 } 757 }
869/* cERROR(1,("CIFSMaxBufSize %d 0x%x",CIFSMaxBufSize,CIFSMaxBufSize)); */ 758/* cERROR(1, "CIFSMaxBufSize %d 0x%x",CIFSMaxBufSize,CIFSMaxBufSize); */
870 cifs_req_cachep = kmem_cache_create("cifs_request", 759 cifs_req_cachep = kmem_cache_create("cifs_request",
871 CIFSMaxBufSize + 760 CIFSMaxBufSize +
872 MAX_CIFS_HDR_SIZE, 0, 761 MAX_CIFS_HDR_SIZE, 0,
@@ -878,7 +767,7 @@ cifs_init_request_bufs(void)
878 cifs_min_rcv = 1; 767 cifs_min_rcv = 1;
879 else if (cifs_min_rcv > 64) { 768 else if (cifs_min_rcv > 64) {
880 cifs_min_rcv = 64; 769 cifs_min_rcv = 64;
881 cERROR(1, ("cifs_min_rcv set to maximum (64)")); 770 cERROR(1, "cifs_min_rcv set to maximum (64)");
882 } 771 }
883 772
884 cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv, 773 cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv,
@@ -909,7 +798,7 @@ cifs_init_request_bufs(void)
909 cifs_min_small = 2; 798 cifs_min_small = 2;
910 else if (cifs_min_small > 256) { 799 else if (cifs_min_small > 256) {
911 cifs_min_small = 256; 800 cifs_min_small = 256;
912 cFYI(1, ("cifs_min_small set to maximum (256)")); 801 cFYI(1, "cifs_min_small set to maximum (256)");
913 } 802 }
914 803
915 cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small, 804 cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small,
@@ -950,15 +839,6 @@ cifs_init_mids(void)
950 return -ENOMEM; 839 return -ENOMEM;
951 } 840 }
952 841
953 cifs_oplock_cachep = kmem_cache_create("cifs_oplock_structs",
954 sizeof(struct oplock_q_entry), 0,
955 SLAB_HWCACHE_ALIGN, NULL);
956 if (cifs_oplock_cachep == NULL) {
957 mempool_destroy(cifs_mid_poolp);
958 kmem_cache_destroy(cifs_mid_cachep);
959 return -ENOMEM;
960 }
961
962 return 0; 842 return 0;
963} 843}
964 844
@@ -967,7 +847,6 @@ cifs_destroy_mids(void)
967{ 847{
968 mempool_destroy(cifs_mid_poolp); 848 mempool_destroy(cifs_mid_poolp);
969 kmem_cache_destroy(cifs_mid_cachep); 849 kmem_cache_destroy(cifs_mid_cachep);
970 kmem_cache_destroy(cifs_oplock_cachep);
971} 850}
972 851
973static int __init 852static int __init
@@ -1007,10 +886,10 @@ init_cifs(void)
1007 886
1008 if (cifs_max_pending < 2) { 887 if (cifs_max_pending < 2) {
1009 cifs_max_pending = 2; 888 cifs_max_pending = 2;
1010 cFYI(1, ("cifs_max_pending set to min of 2")); 889 cFYI(1, "cifs_max_pending set to min of 2");
1011 } else if (cifs_max_pending > 256) { 890 } else if (cifs_max_pending > 256) {
1012 cifs_max_pending = 256; 891 cifs_max_pending = 256;
1013 cFYI(1, ("cifs_max_pending set to max of 256")); 892 cFYI(1, "cifs_max_pending set to max of 256");
1014 } 893 }
1015 894
1016 rc = cifs_init_inodecache(); 895 rc = cifs_init_inodecache();
@@ -1068,7 +947,7 @@ init_cifs(void)
1068static void __exit 947static void __exit
1069exit_cifs(void) 948exit_cifs(void)
1070{ 949{
1071 cFYI(DBG2, ("exit_cifs")); 950 cFYI(DBG2, "exit_cifs");
1072 cifs_proc_clean(); 951 cifs_proc_clean();
1073#ifdef CONFIG_CIFS_DFS_UPCALL 952#ifdef CONFIG_CIFS_DFS_UPCALL
1074 cifs_dfs_release_automount_timer(); 953 cifs_dfs_release_automount_timer();
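The fiddly part of the cifsfs.c changes is that bdi_setup_and_register() creates a resource which three separate paths must now release: the immediate failure branch, the out_mount_failed unwind, and cifs_put_super(). Condensed, with names taken from the hunks above and teardown ordered relative to unload_nls() as in the patch:

	/* condensed sketch of the BDI pairing added to cifsfs.c above */
	rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
	if (rc) {
		kfree(cifs_sb);		/* nothing else allocated yet */
		return rc;
	}
	/* ... mount proceeds; then on any later mount failure, and in
	 * cifs_put_super() at umount time: */
	unload_nls(cifs_sb->local_nls);
	bdi_destroy(&cifs_sb->bdi);	/* pairs with the setup above */
	kfree(cifs_sb);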
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 78c1b86d55f6..0242ff9cbf41 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -61,7 +61,8 @@ extern int cifs_mkdir(struct inode *, struct dentry *, int);
61extern int cifs_rmdir(struct inode *, struct dentry *); 61extern int cifs_rmdir(struct inode *, struct dentry *);
62extern int cifs_rename(struct inode *, struct dentry *, struct inode *, 62extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
63 struct dentry *); 63 struct dentry *);
64extern int cifs_revalidate(struct dentry *); 64extern int cifs_revalidate_file(struct file *filp);
65extern int cifs_revalidate_dentry(struct dentry *);
65extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 66extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
66extern int cifs_setattr(struct dentry *, struct iattr *); 67extern int cifs_setattr(struct dentry *, struct iattr *);
67 68
@@ -113,5 +114,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
113extern const struct export_operations cifs_export_ops; 114extern const struct export_operations cifs_export_ops;
114#endif /* EXPERIMENTAL */ 115#endif /* EXPERIMENTAL */
115 116
116#define CIFS_VERSION "1.62" 117#define CIFS_VERSION "1.64"
117#endif /* _CIFSFS_H */ 118#endif /* _CIFSFS_H */
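The old cifs_revalidate() prototype splits into file and dentry flavors: with an open file the client can revalidate by handle (the new CIFSSMBQFileInfo/CIFSSMBUnixQFileInfo declarations in cifsproto.h below) instead of re-walking the path. cifs_llseek() in cifsfs.c above already switched over; the usage sketch is simply:

	/* usage sketch, as in cifs_llseek(): with an open file, revalidate
	 * by netfid rather than by path */
	retval = cifs_revalidate_file(file);
	if (retval < 0)
		return (loff_t)retval;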
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index a1c817eb291a..a88479ceaad5 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -18,6 +18,7 @@
18 */ 18 */
19#include <linux/in.h> 19#include <linux/in.h>
20#include <linux/in6.h> 20#include <linux/in6.h>
21#include <linux/slab.h>
21#include <linux/slow-work.h> 22#include <linux/slow-work.h>
22#include "cifs_fs_sb.h" 23#include "cifs_fs_sb.h"
23#include "cifsacl.h" 24#include "cifsacl.h"
@@ -86,7 +87,6 @@ enum securityEnum {
86 RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */ 87 RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */
87/* NTLMSSP, */ /* can use rawNTLMSSP instead of NTLMSSP via SPNEGO */ 88/* NTLMSSP, */ /* can use rawNTLMSSP instead of NTLMSSP via SPNEGO */
88 Kerberos, /* Kerberos via SPNEGO */ 89 Kerberos, /* Kerberos via SPNEGO */
89 MSKerberos, /* MS Kerberos via SPNEGO */
90}; 90};
91 91
92enum protocolEnum { 92enum protocolEnum {
@@ -184,6 +184,12 @@ struct TCP_Server_Info {
184 struct mac_key mac_signing_key; 184 struct mac_key mac_signing_key;
185 char ntlmv2_hash[16]; 185 char ntlmv2_hash[16];
186 unsigned long lstrp; /* when we got last response from this server */ 186 unsigned long lstrp; /* when we got last response from this server */
187 u16 dialect; /* dialect index that server chose */
188 /* extended security flavors that server supports */
189 bool sec_kerberos; /* supports plain Kerberos */
190 bool sec_mskerberos; /* supports legacy MS Kerberos */
191 bool sec_kerberosu2u; /* supports U2U Kerberos */
192 bool sec_ntlmssp; /* supports NTLMSSP */
187}; 193};
188 194
189/* 195/*
@@ -389,6 +395,7 @@ struct cifsInodeInfo {
389 bool clientCanCacheRead:1; /* read oplock */ 395 bool clientCanCacheRead:1; /* read oplock */
390 bool clientCanCacheAll:1; /* read and writebehind oplock */ 396 bool clientCanCacheAll:1; /* read and writebehind oplock */
391 bool delete_pending:1; /* DELETE_ON_CLOSE is set */ 397 bool delete_pending:1; /* DELETE_ON_CLOSE is set */
398 bool invalid_mapping:1; /* pagecache is invalid */
392 u64 server_eof; /* current file size on server */ 399 u64 server_eof; /* current file size on server */
393 u64 uniqueid; /* server inode number */ 400 u64 uniqueid; /* server inode number */
394 struct inode vfs_inode; 401 struct inode vfs_inode;
@@ -500,6 +507,7 @@ struct dfs_info3_param {
500#define CIFS_FATTR_DFS_REFERRAL 0x1 507#define CIFS_FATTR_DFS_REFERRAL 0x1
501#define CIFS_FATTR_DELETE_PENDING 0x2 508#define CIFS_FATTR_DELETE_PENDING 0x2
502#define CIFS_FATTR_NEED_REVAL 0x4 509#define CIFS_FATTR_NEED_REVAL 0x4
510#define CIFS_FATTR_INO_COLLISION 0x8
503 511
504struct cifs_fattr { 512struct cifs_fattr {
505 u32 cf_flags; 513 u32 cf_flags;
@@ -715,7 +723,7 @@ GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions
715GLOBAL_EXTERN unsigned int oplockEnabled; 723GLOBAL_EXTERN unsigned int oplockEnabled;
716GLOBAL_EXTERN unsigned int experimEnabled; 724GLOBAL_EXTERN unsigned int experimEnabled;
717GLOBAL_EXTERN unsigned int lookupCacheEnabled; 725GLOBAL_EXTERN unsigned int lookupCacheEnabled;
718GLOBAL_EXTERN unsigned int extended_security; /* if on, session setup sent 726GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent
719 with more secure ntlmssp2 challenge/resp */ 727 with more secure ntlmssp2 challenge/resp */
720GLOBAL_EXTERN unsigned int sign_CIFS_PDUs; /* enable smb packet signing */ 728GLOBAL_EXTERN unsigned int sign_CIFS_PDUs; /* enable smb packet signing */
721GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/ 729GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
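With MSKerberos dropped from enum securityEnum, that distinction moves into the new per-server booleans, which decode_negTokenInit() (see the cifsproto.h change below) fills in from the mechanism list in the server's negotiate blob. A hypothetical consumer, only to show the intended shape; the fallback choice is an assumption, not taken from this patch:

	/* hypothetical helper: how the new capability bits are meant
	 * to be read once negTokenInit has been decoded */
	static enum securityEnum pick_auth_mech(const struct TCP_Server_Info *server)
	{
		if (server->sec_kerberos || server->sec_mskerberos)
			return Kerberos;	/* spnego upcall path */
		if (server->sec_ntlmssp)
			return RawNTLMSSP;
		return NTLMv2;	/* assumed default when nothing is advertised */
	}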
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 88e2bc44ac58..fb1657e0fdb8 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -39,8 +39,20 @@ extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *,
39 unsigned int /* length */); 39 unsigned int /* length */);
40extern unsigned int _GetXid(void); 40extern unsigned int _GetXid(void);
41extern void _FreeXid(unsigned int); 41extern void _FreeXid(unsigned int);
42#define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current_fsuid())); 42#define GetXid() \
43#define FreeXid(curr_xid) {_FreeXid(curr_xid); cFYI(1,("CIFS VFS: leaving %s (xid = %d) rc = %d",__func__,curr_xid,(int)rc));} 43({ \
44 int __xid = (int)_GetXid(); \
45 cFYI(1, "CIFS VFS: in %s as Xid: %d with uid: %d", \
46 __func__, __xid, current_fsuid()); \
47 __xid; \
48})
49
50#define FreeXid(curr_xid) \
51do { \
52 _FreeXid(curr_xid); \
53 cFYI(1, "CIFS VFS: leaving %s (xid = %d) rc = %d", \
54 __func__, curr_xid, (int)rc); \
55} while (0)
44extern char *build_path_from_dentry(struct dentry *); 56extern char *build_path_from_dentry(struct dentry *);
45extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb); 57extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb);
46extern char *build_wildcard_path_from_dentry(struct dentry *direntry); 58extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
@@ -73,7 +85,7 @@ extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *);
73extern unsigned int smbCalcSize(struct smb_hdr *ptr); 85extern unsigned int smbCalcSize(struct smb_hdr *ptr);
74extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr); 86extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
75extern int decode_negTokenInit(unsigned char *security_blob, int length, 87extern int decode_negTokenInit(unsigned char *security_blob, int length,
76 enum securityEnum *secType); 88 struct TCP_Server_Info *server);
77extern int cifs_convert_address(char *src, void *dst); 89extern int cifs_convert_address(char *src, void *dst);
78extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr); 90extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr);
79extern void header_assemble(struct smb_hdr *, char /* command */ , 91extern void header_assemble(struct smb_hdr *, char /* command */ ,
@@ -83,7 +95,6 @@ extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
83 struct cifsSesInfo *ses, 95 struct cifsSesInfo *ses,
84 void **request_buf); 96 void **request_buf);
85extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, 97extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
86 const int stage,
87 const struct nls_table *nls_cp); 98 const struct nls_table *nls_cp);
88extern __u16 GetNextMid(struct TCP_Server_Info *server); 99extern __u16 GetNextMid(struct TCP_Server_Info *server);
89extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); 100extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
@@ -95,8 +106,11 @@ extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode,
95 __u16 fileHandle, struct file *file, 106 __u16 fileHandle, struct file *file,
96 struct vfsmount *mnt, unsigned int oflags); 107 struct vfsmount *mnt, unsigned int oflags);
97extern int cifs_posix_open(char *full_path, struct inode **pinode, 108extern int cifs_posix_open(char *full_path, struct inode **pinode,
98 struct vfsmount *mnt, int mode, int oflags, 109 struct vfsmount *mnt,
99 __u32 *poplock, __u16 *pnetfid, int xid); 110 struct super_block *sb,
111 int mode, int oflags,
112 __u32 *poplock, __u16 *pnetfid, int xid);
113void cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr);
100extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, 114extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
101 FILE_UNIX_BASIC_INFO *info, 115 FILE_UNIX_BASIC_INFO *info,
102 struct cifs_sb_info *cifs_sb); 116 struct cifs_sb_info *cifs_sb);
@@ -104,10 +118,12 @@ extern void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr);
104extern struct inode *cifs_iget(struct super_block *sb, 118extern struct inode *cifs_iget(struct super_block *sb,
105 struct cifs_fattr *fattr); 119 struct cifs_fattr *fattr);
106 120
121extern int cifs_get_file_info(struct file *filp);
107extern int cifs_get_inode_info(struct inode **pinode, 122extern int cifs_get_inode_info(struct inode **pinode,
108 const unsigned char *search_path, 123 const unsigned char *search_path,
109 FILE_ALL_INFO *pfile_info, 124 FILE_ALL_INFO *pfile_info,
110 struct super_block *sb, int xid, const __u16 *pfid); 125 struct super_block *sb, int xid, const __u16 *pfid);
126extern int cifs_get_file_info_unix(struct file *filp);
111extern int cifs_get_inode_info_unix(struct inode **pinode, 127extern int cifs_get_inode_info_unix(struct inode **pinode,
112 const unsigned char *search_path, 128 const unsigned char *search_path,
113 struct super_block *sb, int xid); 129 struct super_block *sb, int xid);
@@ -123,7 +139,9 @@ extern void cifs_dfs_release_automount_timer(void);
123void cifs_proc_init(void); 139void cifs_proc_init(void);
124void cifs_proc_clean(void); 140void cifs_proc_clean(void);
125 141
126extern int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, 142extern int cifs_negotiate_protocol(unsigned int xid,
143 struct cifsSesInfo *ses);
144extern int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
127 struct nls_table *nls_info); 145 struct nls_table *nls_info);
128extern int CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses); 146extern int CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses);
129 147
@@ -142,6 +160,8 @@ extern int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
142extern int CIFSFindClose(const int, struct cifsTconInfo *tcon, 160extern int CIFSFindClose(const int, struct cifsTconInfo *tcon,
143 const __u16 search_handle); 161 const __u16 search_handle);
144 162
163extern int CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
164 u16 netfid, FILE_ALL_INFO *pFindData);
145extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon, 165extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
146 const unsigned char *searchName, 166 const unsigned char *searchName,
147 FILE_ALL_INFO *findData, 167 FILE_ALL_INFO *findData,
@@ -152,6 +172,8 @@ extern int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
152 FILE_ALL_INFO *findData, 172 FILE_ALL_INFO *findData,
153 const struct nls_table *nls_codepage, int remap); 173 const struct nls_table *nls_codepage, int remap);
154 174
175extern int CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
176 u16 netfid, FILE_UNIX_BASIC_INFO *pFindData);
155extern int CIFSSMBUnixQPathInfo(const int xid, 177extern int CIFSSMBUnixQPathInfo(const int xid,
156 struct cifsTconInfo *tcon, 178 struct cifsTconInfo *tcon,
157 const unsigned char *searchName, 179 const unsigned char *searchName,
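The GetXid()/FreeXid() rewrite above deserves a note, because the old one-liners were not statement-safe: GetXid() expanded to two statements (the cast plus a cFYI() that quietly relied on a variable literally named xid), and FreeXid()'s bare block broke if/else pairing. The new forms use a GNU statement expression so GetXid() yields a value, and do { } while (0) so FreeXid() acts as one statement; note FreeXid() still expects an rc in scope for its exit log. For example:

	/* why do { } while (0) matters: only part of the old FreeXid()
	 * landed under the if in this common shape */
	if (rc)
		FreeXid(xid);	/* now a single statement, safe before else */
	else
		rc = do_more_work();	/* do_more_work() is a placeholder */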
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 9d17df3e0768..c65c3419dd37 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/cifssmb.c 2 * fs/cifs/cifssmb.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2002,2009 4 * Copyright (C) International Business Machines Corp., 2002,2010
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * Contains the routines for constructing the SMB PDUs themselves 7 * Contains the routines for constructing the SMB PDUs themselves
@@ -30,6 +30,7 @@
30#include <linux/fs.h> 30#include <linux/fs.h>
31#include <linux/kernel.h> 31#include <linux/kernel.h>
32#include <linux/vfs.h> 32#include <linux/vfs.h>
33#include <linux/slab.h>
33#include <linux/posix_acl_xattr.h> 34#include <linux/posix_acl_xattr.h>
34#include <asm/uaccess.h> 35#include <asm/uaccess.h>
35#include "cifspdu.h" 36#include "cifspdu.h"
@@ -129,8 +130,8 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
129 if (smb_command != SMB_COM_WRITE_ANDX && 130 if (smb_command != SMB_COM_WRITE_ANDX &&
130 smb_command != SMB_COM_OPEN_ANDX && 131 smb_command != SMB_COM_OPEN_ANDX &&
131 smb_command != SMB_COM_TREE_DISCONNECT) { 132 smb_command != SMB_COM_TREE_DISCONNECT) {
132 cFYI(1, ("can not send cmd %d while umounting", 133 cFYI(1, "can not send cmd %d while umounting",
133 smb_command)); 134 smb_command);
134 return -ENODEV; 135 return -ENODEV;
135 } 136 }
136 } 137 }
@@ -156,7 +157,7 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
156 * back on-line 157 * back on-line
157 */ 158 */
158 if (!tcon->retry || ses->status == CifsExiting) { 159 if (!tcon->retry || ses->status == CifsExiting) {
159 cFYI(1, ("gave up waiting on reconnect in smb_init")); 160 cFYI(1, "gave up waiting on reconnect in smb_init");
160 return -EHOSTDOWN; 161 return -EHOSTDOWN;
161 } 162 }
162 } 163 }
@@ -171,7 +172,8 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
171 * reconnect the same SMB session 172 * reconnect the same SMB session
172 */ 173 */
173 mutex_lock(&ses->session_mutex); 174 mutex_lock(&ses->session_mutex);
174 if (ses->need_reconnect) 175 rc = cifs_negotiate_protocol(0, ses);
176 if (rc == 0 && ses->need_reconnect)
175 rc = cifs_setup_session(0, ses, nls_codepage); 177 rc = cifs_setup_session(0, ses, nls_codepage);
176 178
177 /* do we need to reconnect tcon? */ 179 /* do we need to reconnect tcon? */
@@ -183,7 +185,7 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
183 mark_open_files_invalid(tcon); 185 mark_open_files_invalid(tcon);
184 rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage); 186 rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
185 mutex_unlock(&ses->session_mutex); 187 mutex_unlock(&ses->session_mutex);
186 cFYI(1, ("reconnect tcon rc = %d", rc)); 188 cFYI(1, "reconnect tcon rc = %d", rc);
187 189
188 if (rc) 190 if (rc)
189 goto out; 191 goto out;
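The reconnect path above no longer assumes a still-valid NEGOTIATE from the original connection: cifs_negotiate_protocol() is re-run under the session mutex before session setup, since a reset TCP connection discards the negotiated dialect, capabilities, and signing state. In outline:

	/* outline of the reconnect ordering after this change */
	mutex_lock(&ses->session_mutex);
	rc = cifs_negotiate_protocol(0, ses);	/* redo NEGOTIATE if needed */
	if (rc == 0 && ses->need_reconnect)
		rc = cifs_setup_session(0, ses, nls_codepage); /* SESSION_SETUP */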
@@ -354,7 +356,6 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
354 struct TCP_Server_Info *server; 356 struct TCP_Server_Info *server;
355 u16 count; 357 u16 count;
356 unsigned int secFlags; 358 unsigned int secFlags;
357 u16 dialect;
358 359
359 if (ses->server) 360 if (ses->server)
360 server = ses->server; 361 server = ses->server;
@@ -371,9 +372,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
371 if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL))) 372 if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
372 secFlags = ses->overrideSecFlg; /* BB FIXME fix sign flags? */ 373 secFlags = ses->overrideSecFlg; /* BB FIXME fix sign flags? */
373 else /* if override flags set only sign/seal OR them with global auth */ 374 else /* if override flags set only sign/seal OR them with global auth */
374 secFlags = extended_security | ses->overrideSecFlg; 375 secFlags = global_secflags | ses->overrideSecFlg;
375 376
376 cFYI(1, ("secFlags 0x%x", secFlags)); 377 cFYI(1, "secFlags 0x%x", secFlags);
377 378
378 pSMB->hdr.Mid = GetNextMid(server); 379 pSMB->hdr.Mid = GetNextMid(server);
379 pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS); 380 pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS);
@@ -381,14 +382,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
381 if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5) 382 if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
382 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 383 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
383 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) { 384 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) {
384 cFYI(1, ("Kerberos only mechanism, enable extended security")); 385 cFYI(1, "Kerberos only mechanism, enable extended security");
385 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 386 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
386 } 387 }
387#ifdef CONFIG_CIFS_EXPERIMENTAL 388#ifdef CONFIG_CIFS_EXPERIMENTAL
388 else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP) 389 else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
389 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 390 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
390 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) { 391 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) {
391 cFYI(1, ("NTLMSSP only mechanism, enable extended security")); 392 cFYI(1, "NTLMSSP only mechanism, enable extended security");
392 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 393 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
393 } 394 }
394#endif 395#endif
@@ -407,10 +408,10 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
407 if (rc != 0) 408 if (rc != 0)
408 goto neg_err_exit; 409 goto neg_err_exit;
409 410
410 dialect = le16_to_cpu(pSMBr->DialectIndex); 411 server->dialect = le16_to_cpu(pSMBr->DialectIndex);
411 cFYI(1, ("Dialect: %d", dialect)); 412 cFYI(1, "Dialect: %d", server->dialect);
412 /* Check wct = 1 error case */ 413 /* Check wct = 1 error case */
413 if ((pSMBr->hdr.WordCount < 13) || (dialect == BAD_PROT)) { 414 if ((pSMBr->hdr.WordCount < 13) || (server->dialect == BAD_PROT)) {
414 /* core returns wct = 1, but we do not ask for core - otherwise 415 /* core returns wct = 1, but we do not ask for core - otherwise
415 small wct just comes when dialect index is -1 indicating we 416 small wct just comes when dialect index is -1 indicating we
416 could not negotiate a common dialect */ 417 could not negotiate a common dialect */
@@ -418,8 +419,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
418 goto neg_err_exit; 419 goto neg_err_exit;
419#ifdef CONFIG_CIFS_WEAK_PW_HASH 420#ifdef CONFIG_CIFS_WEAK_PW_HASH
420 } else if ((pSMBr->hdr.WordCount == 13) 421 } else if ((pSMBr->hdr.WordCount == 13)
421 && ((dialect == LANMAN_PROT) 422 && ((server->dialect == LANMAN_PROT)
422 || (dialect == LANMAN2_PROT))) { 423 || (server->dialect == LANMAN2_PROT))) {
423 __s16 tmp; 424 __s16 tmp;
424 struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr; 425 struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr;
425 426
@@ -427,8 +428,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
427 (secFlags & CIFSSEC_MAY_PLNTXT)) 428 (secFlags & CIFSSEC_MAY_PLNTXT))
428 server->secType = LANMAN; 429 server->secType = LANMAN;
429 else { 430 else {
430 cERROR(1, ("mount failed weak security disabled" 431 cERROR(1, "mount failed weak security disabled"
431 " in /proc/fs/cifs/SecurityFlags")); 432 " in /proc/fs/cifs/SecurityFlags");
432 rc = -EOPNOTSUPP; 433 rc = -EOPNOTSUPP;
433 goto neg_err_exit; 434 goto neg_err_exit;
434 } 435 }
@@ -461,9 +462,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
461 utc = CURRENT_TIME; 462 utc = CURRENT_TIME;
462 ts = cnvrtDosUnixTm(rsp->SrvTime.Date, 463 ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
463 rsp->SrvTime.Time, 0); 464 rsp->SrvTime.Time, 0);
464 cFYI(1, ("SrvTime %d sec since 1970 (utc: %d) diff: %d", 465 cFYI(1, "SrvTime %d sec since 1970 (utc: %d) diff: %d",
465 (int)ts.tv_sec, (int)utc.tv_sec, 466 (int)ts.tv_sec, (int)utc.tv_sec,
466 (int)(utc.tv_sec - ts.tv_sec))); 467 (int)(utc.tv_sec - ts.tv_sec));
467 val = (int)(utc.tv_sec - ts.tv_sec); 468 val = (int)(utc.tv_sec - ts.tv_sec);
468 seconds = abs(val); 469 seconds = abs(val);
469 result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ; 470 result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
@@ -477,7 +478,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
477 server->timeAdj = (int)tmp; 478 server->timeAdj = (int)tmp;
478 server->timeAdj *= 60; /* also in seconds */ 479 server->timeAdj *= 60; /* also in seconds */
479 } 480 }
480 cFYI(1, ("server->timeAdj: %d seconds", server->timeAdj)); 481 cFYI(1, "server->timeAdj: %d seconds", server->timeAdj);
481 482
482 483
483 /* BB get server time for time conversions and add 484 /* BB get server time for time conversions and add
@@ -492,15 +493,15 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
492 goto neg_err_exit; 493 goto neg_err_exit;
493 } 494 }
494 495
495 cFYI(1, ("LANMAN negotiated")); 496 cFYI(1, "LANMAN negotiated");
496 /* we will not end up setting signing flags - as no signing 497 /* we will not end up setting signing flags - as no signing
497 was in LANMAN and server did not return the flags on */ 498 was in LANMAN and server did not return the flags on */
498 goto signing_check; 499 goto signing_check;
499#else /* weak security disabled */ 500#else /* weak security disabled */
500 } else if (pSMBr->hdr.WordCount == 13) { 501 } else if (pSMBr->hdr.WordCount == 13) {
501 cERROR(1, ("mount failed, cifs module not built " 502 cERROR(1, "mount failed, cifs module not built "
502 "with CIFS_WEAK_PW_HASH support")); 503 "with CIFS_WEAK_PW_HASH support");
503 rc = -EOPNOTSUPP; 504 rc = -EOPNOTSUPP;
504#endif /* WEAK_PW_HASH */ 505#endif /* WEAK_PW_HASH */
505 goto neg_err_exit; 506 goto neg_err_exit;
506 } else if (pSMBr->hdr.WordCount != 17) { 507 } else if (pSMBr->hdr.WordCount != 17) {
@@ -511,14 +512,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
511 /* else wct == 17 NTLM */ 512 /* else wct == 17 NTLM */
512 server->secMode = pSMBr->SecurityMode; 513 server->secMode = pSMBr->SecurityMode;
513 if ((server->secMode & SECMODE_USER) == 0) 514 if ((server->secMode & SECMODE_USER) == 0)
514 cFYI(1, ("share mode security")); 515 cFYI(1, "share mode security");
515 516
516 if ((server->secMode & SECMODE_PW_ENCRYPT) == 0) 517 if ((server->secMode & SECMODE_PW_ENCRYPT) == 0)
517#ifdef CONFIG_CIFS_WEAK_PW_HASH 518#ifdef CONFIG_CIFS_WEAK_PW_HASH
518 if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0) 519 if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0)
519#endif /* CIFS_WEAK_PW_HASH */ 520#endif /* CIFS_WEAK_PW_HASH */
520 cERROR(1, ("Server requests plain text password" 521 cERROR(1, "Server requests plain text password"
521 " but client support disabled")); 522 " but client support disabled");
522 523
523 if ((secFlags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2) 524 if ((secFlags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2)
524 server->secType = NTLMv2; 525 server->secType = NTLMv2;
@@ -538,7 +539,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
538#endif */ 539#endif */
539 else { 540 else {
540 rc = -EOPNOTSUPP; 541 rc = -EOPNOTSUPP;
541 cERROR(1, ("Invalid security type")); 542 cERROR(1, "Invalid security type");
542 goto neg_err_exit; 543 goto neg_err_exit;
543 } 544 }
544 /* else ... any others ...? */ 545 /* else ... any others ...? */
@@ -550,7 +551,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
550 server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize), 551 server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize),
551 (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); 552 (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
552 server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); 553 server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
553 cFYI(DBG2, ("Max buf = %d", ses->server->maxBuf)); 554 cFYI(DBG2, "Max buf = %d", ses->server->maxBuf);
554 GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey); 555 GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
555 server->capabilities = le32_to_cpu(pSMBr->Capabilities); 556 server->capabilities = le32_to_cpu(pSMBr->Capabilities);
556 server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone); 557 server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
@@ -581,7 +582,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
581 if (memcmp(server->server_GUID, 582 if (memcmp(server->server_GUID,
582 pSMBr->u.extended_response. 583 pSMBr->u.extended_response.
583 GUID, 16) != 0) { 584 GUID, 16) != 0) {
584 cFYI(1, ("server UID changed")); 585 cFYI(1, "server UID changed");
585 memcpy(server->server_GUID, 586 memcpy(server->server_GUID,
586 pSMBr->u.extended_response.GUID, 587 pSMBr->u.extended_response.GUID,
587 16); 588 16);
@@ -596,13 +597,19 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
596 server->secType = RawNTLMSSP; 597 server->secType = RawNTLMSSP;
597 } else { 598 } else {
598 rc = decode_negTokenInit(pSMBr->u.extended_response. 599 rc = decode_negTokenInit(pSMBr->u.extended_response.
599 SecurityBlob, 600 SecurityBlob, count - 16,
600 count - 16, 601 server);
601 &server->secType);
602 if (rc == 1) 602 if (rc == 1)
603 rc = 0; 603 rc = 0;
604 else 604 else
605 rc = -EINVAL; 605 rc = -EINVAL;
606
607 if (server->sec_kerberos || server->sec_mskerberos)
608 server->secType = Kerberos;
609 else if (server->sec_ntlmssp)
610 server->secType = RawNTLMSSP;
611 else
612 rc = -EOPNOTSUPP;
606 } 613 }
607 } else 614 } else
608 server->capabilities &= ~CAP_EXTENDED_SECURITY; 615 server->capabilities &= ~CAP_EXTENDED_SECURITY;
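In the hunk above, decode_negTokenInit() no longer writes a single secType through an out-pointer; it is assumed to record, as boolean flags on the server structure (sec_kerberos, sec_mskerberos, sec_ntlmssp), which mechanisms appeared in the server's SPNEGO mechType list, and the caller then reduces those flags to one secType. An illustrative user-space sketch of the per-OID flag setting; the struct and function names here are stand-ins, while the OID byte strings are the standard DER encodings:

#include <stdbool.h>
#include <stddef.h>
#include <string.h>

/* Kerberos 5 (1.2.840.113554.1.2.2), MS Kerberos (1.2.840.48018.1.2.2)
 * and NTLMSSP (1.3.6.1.4.1.311.2.2.10), DER-encoded. */
static const unsigned char OID_KRB5[]    = {0x2a,0x86,0x48,0x86,0xf7,0x12,0x01,0x02,0x02};
static const unsigned char OID_MSKRB5[]  = {0x2a,0x86,0x48,0x82,0xf7,0x12,0x01,0x02,0x02};
static const unsigned char OID_NTLMSSP[] = {0x2b,0x06,0x01,0x04,0x01,0x82,0x37,0x02,0x02,0x0a};

struct mech_flags {	/* stand-in for the TCP_Server_Info fields */
	bool sec_kerberos, sec_mskerberos, sec_ntlmssp;
};

static void note_mech(struct mech_flags *f, const unsigned char *oid, size_t len)
{
	if (len == sizeof(OID_KRB5) && !memcmp(oid, OID_KRB5, len))
		f->sec_kerberos = true;
	else if (len == sizeof(OID_MSKRB5) && !memcmp(oid, OID_MSKRB5, len))
		f->sec_mskerberos = true;
	else if (len == sizeof(OID_NTLMSSP) && !memcmp(oid, OID_NTLMSSP, len))
		f->sec_ntlmssp = true;
	/* unknown mechanisms are simply ignored */
}

int main(void)
{
	struct mech_flags f = {0};

	note_mech(&f, OID_NTLMSSP, sizeof(OID_NTLMSSP));
	return f.sec_ntlmssp ? 0 : 1;
}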
@@ -613,22 +620,21 @@ signing_check:
613 if ((secFlags & CIFSSEC_MAY_SIGN) == 0) { 620 if ((secFlags & CIFSSEC_MAY_SIGN) == 0) {
614 /* MUST_SIGN already includes the MAY_SIGN FLAG 621 /* MUST_SIGN already includes the MAY_SIGN FLAG
615 so if this is zero it means that signing is disabled */ 622 so if this is zero it means that signing is disabled */
616 cFYI(1, ("Signing disabled")); 623 cFYI(1, "Signing disabled");
617 if (server->secMode & SECMODE_SIGN_REQUIRED) { 624 if (server->secMode & SECMODE_SIGN_REQUIRED) {
618 cERROR(1, ("Server requires " 625 cERROR(1, "Server requires "
619 "packet signing to be enabled in " 626 "packet signing to be enabled in "
620 "/proc/fs/cifs/SecurityFlags.")); 627 "/proc/fs/cifs/SecurityFlags.");
621 rc = -EOPNOTSUPP; 628 rc = -EOPNOTSUPP;
622 } 629 }
623 server->secMode &= 630 server->secMode &=
624 ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED); 631 ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
625 } else if ((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) { 632 } else if ((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) {
626 /* signing required */ 633 /* signing required */
627 cFYI(1, ("Must sign - secFlags 0x%x", secFlags)); 634 cFYI(1, "Must sign - secFlags 0x%x", secFlags);
628 if ((server->secMode & 635 if ((server->secMode &
629 (SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) { 636 (SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) {
630 cERROR(1, 637 cERROR(1, "signing required but server lacks support");
631 ("signing required but server lacks support"));
632 rc = -EOPNOTSUPP; 638 rc = -EOPNOTSUPP;
633 } else 639 } else
634 server->secMode |= SECMODE_SIGN_REQUIRED; 640 server->secMode |= SECMODE_SIGN_REQUIRED;
@@ -642,7 +648,7 @@ signing_check:
642neg_err_exit: 648neg_err_exit:
643 cifs_buf_release(pSMB); 649 cifs_buf_release(pSMB);
644 650
645 cFYI(1, ("negprot rc %d", rc)); 651 cFYI(1, "negprot rc %d", rc);
646 return rc; 652 return rc;
647} 653}
648 654
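Most of the churn in this file is the same mechanical conversion seen throughout the function above: cFYI(1, ("fmt", args)) becomes cFYI(1, "fmt", args), dropping the inner parentheses the old non-variadic macros required. A minimal sketch, not the exact cifs_debug.h definitions, of variadic replacements that accept the new call sites:

/* Minimal sketch only; the real macros also gate on cifsFYI and the
 * CIFS debug Kconfig options. With GNU/C99 variadic macros the format
 * string and its arguments forward straight into printk(), so the
 * extra parentheses become unnecessary. */
#define cFYI(set, fmt, ...)						\
do {									\
	if (set)							\
		printk(KERN_DEBUG "CIFS: " fmt "\n", ##__VA_ARGS__);	\
} while (0)

#define cERROR(set, fmt, ...)						\
do {									\
	if (set)							\
		printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__);	\
} while (0)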
@@ -652,7 +658,7 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
652 struct smb_hdr *smb_buffer; 658 struct smb_hdr *smb_buffer;
653 int rc = 0; 659 int rc = 0;
654 660
655 cFYI(1, ("In tree disconnect")); 661 cFYI(1, "In tree disconnect");
656 662
657 /* BB: do we need to check this? These should never be NULL. */ 663 /* BB: do we need to check this? These should never be NULL. */
658 if ((tcon->ses == NULL) || (tcon->ses->server == NULL)) 664 if ((tcon->ses == NULL) || (tcon->ses->server == NULL))
@@ -674,7 +680,7 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
674 680
675 rc = SendReceiveNoRsp(xid, tcon->ses, smb_buffer, 0); 681 rc = SendReceiveNoRsp(xid, tcon->ses, smb_buffer, 0);
676 if (rc) 682 if (rc)
677 cFYI(1, ("Tree disconnect failed %d", rc)); 683 cFYI(1, "Tree disconnect failed %d", rc);
678 684
679 /* No need to return error on this operation if tid invalidated and 685 /* No need to return error on this operation if tid invalidated and
680 closed on server already e.g. due to tcp session crashing */ 686 closed on server already e.g. due to tcp session crashing */
@@ -690,7 +696,7 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
690 LOGOFF_ANDX_REQ *pSMB; 696 LOGOFF_ANDX_REQ *pSMB;
691 int rc = 0; 697 int rc = 0;
692 698
693 cFYI(1, ("In SMBLogoff for session disconnect")); 699 cFYI(1, "In SMBLogoff for session disconnect");
694 700
695 /* 701 /*
696 * BB: do we need to check validity of ses and server? They should 702 * BB: do we need to check validity of ses and server? They should
@@ -743,7 +749,7 @@ CIFSPOSIXDelFile(const int xid, struct cifsTconInfo *tcon, const char *fileName,
743 int bytes_returned = 0; 749 int bytes_returned = 0;
744 __u16 params, param_offset, offset, byte_count; 750 __u16 params, param_offset, offset, byte_count;
745 751
746 cFYI(1, ("In POSIX delete")); 752 cFYI(1, "In POSIX delete");
747PsxDelete: 753PsxDelete:
748 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 754 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
749 (void **) &pSMBr); 755 (void **) &pSMBr);
@@ -795,7 +801,7 @@ PsxDelete:
795 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 801 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
796 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 802 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
797 if (rc) 803 if (rc)
798 cFYI(1, ("Posix delete returned %d", rc)); 804 cFYI(1, "Posix delete returned %d", rc);
799 cifs_buf_release(pSMB); 805 cifs_buf_release(pSMB);
800 806
801 cifs_stats_inc(&tcon->num_deletes); 807 cifs_stats_inc(&tcon->num_deletes);
@@ -842,7 +848,7 @@ DelFileRetry:
842 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 848 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
843 cifs_stats_inc(&tcon->num_deletes); 849 cifs_stats_inc(&tcon->num_deletes);
844 if (rc) 850 if (rc)
845 cFYI(1, ("Error in RMFile = %d", rc)); 851 cFYI(1, "Error in RMFile = %d", rc);
846 852
847 cifs_buf_release(pSMB); 853 cifs_buf_release(pSMB);
848 if (rc == -EAGAIN) 854 if (rc == -EAGAIN)
@@ -861,7 +867,7 @@ CIFSSMBRmDir(const int xid, struct cifsTconInfo *tcon, const char *dirName,
861 int bytes_returned; 867 int bytes_returned;
862 int name_len; 868 int name_len;
863 869
864 cFYI(1, ("In CIFSSMBRmDir")); 870 cFYI(1, "In CIFSSMBRmDir");
865RmDirRetry: 871RmDirRetry:
866 rc = smb_init(SMB_COM_DELETE_DIRECTORY, 0, tcon, (void **) &pSMB, 872 rc = smb_init(SMB_COM_DELETE_DIRECTORY, 0, tcon, (void **) &pSMB,
867 (void **) &pSMBr); 873 (void **) &pSMBr);
@@ -886,7 +892,7 @@ RmDirRetry:
886 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 892 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
887 cifs_stats_inc(&tcon->num_rmdirs); 893 cifs_stats_inc(&tcon->num_rmdirs);
888 if (rc) 894 if (rc)
889 cFYI(1, ("Error in RMDir = %d", rc)); 895 cFYI(1, "Error in RMDir = %d", rc);
890 896
891 cifs_buf_release(pSMB); 897 cifs_buf_release(pSMB);
892 if (rc == -EAGAIN) 898 if (rc == -EAGAIN)
@@ -904,7 +910,7 @@ CIFSSMBMkDir(const int xid, struct cifsTconInfo *tcon,
904 int bytes_returned; 910 int bytes_returned;
905 int name_len; 911 int name_len;
906 912
907 cFYI(1, ("In CIFSSMBMkDir")); 913 cFYI(1, "In CIFSSMBMkDir");
908MkDirRetry: 914MkDirRetry:
909 rc = smb_init(SMB_COM_CREATE_DIRECTORY, 0, tcon, (void **) &pSMB, 915 rc = smb_init(SMB_COM_CREATE_DIRECTORY, 0, tcon, (void **) &pSMB,
910 (void **) &pSMBr); 916 (void **) &pSMBr);
@@ -929,7 +935,7 @@ MkDirRetry:
929 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 935 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
930 cifs_stats_inc(&tcon->num_mkdirs); 936 cifs_stats_inc(&tcon->num_mkdirs);
931 if (rc) 937 if (rc)
932 cFYI(1, ("Error in Mkdir = %d", rc)); 938 cFYI(1, "Error in Mkdir = %d", rc);
933 939
934 cifs_buf_release(pSMB); 940 cifs_buf_release(pSMB);
935 if (rc == -EAGAIN) 941 if (rc == -EAGAIN)
@@ -952,7 +958,7 @@ CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon, __u32 posix_flags,
952 OPEN_PSX_REQ *pdata; 958 OPEN_PSX_REQ *pdata;
953 OPEN_PSX_RSP *psx_rsp; 959 OPEN_PSX_RSP *psx_rsp;
954 960
955 cFYI(1, ("In POSIX Create")); 961 cFYI(1, "In POSIX Create");
956PsxCreat: 962PsxCreat:
957 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 963 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
958 (void **) &pSMBr); 964 (void **) &pSMBr);
@@ -1006,11 +1012,11 @@ PsxCreat:
1006 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 1012 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
1007 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 1013 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
1008 if (rc) { 1014 if (rc) {
1009 cFYI(1, ("Posix create returned %d", rc)); 1015 cFYI(1, "Posix create returned %d", rc);
1010 goto psx_create_err; 1016 goto psx_create_err;
1011 } 1017 }
1012 1018
1013 cFYI(1, ("copying inode info")); 1019 cFYI(1, "copying inode info");
1014 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 1020 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
1015 1021
1016 if (rc || (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP))) { 1022 if (rc || (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP))) {
@@ -1032,11 +1038,11 @@ PsxCreat:
1032 /* check to make sure response data is there */ 1038 /* check to make sure response data is there */
1033 if (psx_rsp->ReturnedLevel != cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC)) { 1039 if (psx_rsp->ReturnedLevel != cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC)) {
1034 pRetData->Type = cpu_to_le32(-1); /* unknown */ 1040 pRetData->Type = cpu_to_le32(-1); /* unknown */
1035 cFYI(DBG2, ("unknown type")); 1041 cFYI(DBG2, "unknown type");
1036 } else { 1042 } else {
1037 if (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP) 1043 if (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP)
1038 + sizeof(FILE_UNIX_BASIC_INFO)) { 1044 + sizeof(FILE_UNIX_BASIC_INFO)) {
1039 cERROR(1, ("Open response data too small")); 1045 cERROR(1, "Open response data too small");
1040 pRetData->Type = cpu_to_le32(-1); 1046 pRetData->Type = cpu_to_le32(-1);
1041 goto psx_create_err; 1047 goto psx_create_err;
1042 } 1048 }
@@ -1083,7 +1089,7 @@ static __u16 convert_disposition(int disposition)
1083 ofun = SMBOPEN_OCREATE | SMBOPEN_OTRUNC; 1089 ofun = SMBOPEN_OCREATE | SMBOPEN_OTRUNC;
1084 break; 1090 break;
1085 default: 1091 default:
1086 cFYI(1, ("unknown disposition %d", disposition)); 1092 cFYI(1, "unknown disposition %d", disposition);
1087 ofun = SMBOPEN_OAPPEND; /* regular open */ 1093 ofun = SMBOPEN_OAPPEND; /* regular open */
1088 } 1094 }
1089 return ofun; 1095 return ofun;
@@ -1174,7 +1180,7 @@ OldOpenRetry:
1174 (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP); 1180 (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP);
1175 cifs_stats_inc(&tcon->num_opens); 1181 cifs_stats_inc(&tcon->num_opens);
1176 if (rc) { 1182 if (rc) {
1177 cFYI(1, ("Error in Open = %d", rc)); 1183 cFYI(1, "Error in Open = %d", rc);
1178 } else { 1184 } else {
1179 /* BB verify if wct == 15 */ 1185 /* BB verify if wct == 15 */
1180 1186
@@ -1287,7 +1293,7 @@ openRetry:
1287 (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP); 1293 (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP);
1288 cifs_stats_inc(&tcon->num_opens); 1294 cifs_stats_inc(&tcon->num_opens);
1289 if (rc) { 1295 if (rc) {
1290 cFYI(1, ("Error in Open = %d", rc)); 1296 cFYI(1, "Error in Open = %d", rc);
1291 } else { 1297 } else {
1292 *pOplock = pSMBr->OplockLevel; /* 1 byte no need to le_to_cpu */ 1298 *pOplock = pSMBr->OplockLevel; /* 1 byte no need to le_to_cpu */
1293 *netfid = pSMBr->Fid; /* cifs fid stays in le */ 1299 *netfid = pSMBr->Fid; /* cifs fid stays in le */
@@ -1325,7 +1331,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
1325 int resp_buf_type = 0; 1331 int resp_buf_type = 0;
1326 struct kvec iov[1]; 1332 struct kvec iov[1];
1327 1333
1328 cFYI(1, ("Reading %d bytes on fid %d", count, netfid)); 1334 cFYI(1, "Reading %d bytes on fid %d", count, netfid);
1329 if (tcon->ses->capabilities & CAP_LARGE_FILES) 1335 if (tcon->ses->capabilities & CAP_LARGE_FILES)
1330 wct = 12; 1336 wct = 12;
1331 else { 1337 else {
@@ -1370,7 +1376,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
1370 cifs_stats_inc(&tcon->num_reads); 1376 cifs_stats_inc(&tcon->num_reads);
1371 pSMBr = (READ_RSP *)iov[0].iov_base; 1377 pSMBr = (READ_RSP *)iov[0].iov_base;
1372 if (rc) { 1378 if (rc) {
1373 cERROR(1, ("Send error in read = %d", rc)); 1379 cERROR(1, "Send error in read = %d", rc);
1374 } else { 1380 } else {
1375 int data_length = le16_to_cpu(pSMBr->DataLengthHigh); 1381 int data_length = le16_to_cpu(pSMBr->DataLengthHigh);
1376 data_length = data_length << 16; 1382 data_length = data_length << 16;
@@ -1380,15 +1386,15 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
1380 /*check that DataLength would not go beyond end of SMB */ 1386 /*check that DataLength would not go beyond end of SMB */
1381 if ((data_length > CIFSMaxBufSize) 1387 if ((data_length > CIFSMaxBufSize)
1382 || (data_length > count)) { 1388 || (data_length > count)) {
1383 cFYI(1, ("bad length %d for count %d", 1389 cFYI(1, "bad length %d for count %d",
1384 data_length, count)); 1390 data_length, count);
1385 rc = -EIO; 1391 rc = -EIO;
1386 *nbytes = 0; 1392 *nbytes = 0;
1387 } else { 1393 } else {
1388 pReadData = (char *) (&pSMBr->hdr.Protocol) + 1394 pReadData = (char *) (&pSMBr->hdr.Protocol) +
1389 le16_to_cpu(pSMBr->DataOffset); 1395 le16_to_cpu(pSMBr->DataOffset);
1390/* if (rc = copy_to_user(buf, pReadData, data_length)) { 1396/* if (rc = copy_to_user(buf, pReadData, data_length)) {
1391 cERROR(1,("Faulting on read rc = %d",rc)); 1397 cERROR(1, "Faulting on read rc = %d",rc);
1392 rc = -EFAULT; 1398 rc = -EFAULT;
1393 }*/ /* can not use copy_to_user when using page cache*/ 1399 }*/ /* can not use copy_to_user when using page cache*/
1394 if (*buf) 1400 if (*buf)
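The read path above reassembles a 32-bit length from the response's DataLengthHigh/DataLength halves and rejects anything that would run past either the negotiated buffer or the caller's request. A user-space sketch of that check, with MAX_BUF standing in for the real CIFSMaxBufSize:

#include <stdio.h>

enum { MAX_BUF = 16384 };	/* illustrative, not the real CIFSMaxBufSize */

/* Rebuild the 32-bit data length from its two 16-bit wire fields and
 * refuse anything the server should not have been able to send. */
static int checked_data_length(unsigned short hi, unsigned short lo,
			       unsigned int count)
{
	unsigned int len = ((unsigned int)hi << 16) + lo;

	if (len > MAX_BUF || len > count)
		return -1;	/* would run past the SMB or the request */
	return (int)len;
}

int main(void)
{
	printf("%d\n", checked_data_length(0, 4096, 8192));	/* 4096 */
	printf("%d\n", checked_data_length(2, 0, 8192));	/* -1 */
	return 0;
}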
@@ -1430,7 +1436,9 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
1430 __u32 bytes_sent; 1436 __u32 bytes_sent;
1431 __u16 byte_count; 1437 __u16 byte_count;
1432 1438
1433 /* cFYI(1, ("write at %lld %d bytes", offset, count));*/ 1439 *nbytes = 0;
1440
1441 /* cFYI(1, "write at %lld %d bytes", offset, count);*/
1434 if (tcon->ses == NULL) 1442 if (tcon->ses == NULL)
1435 return -ECONNABORTED; 1443 return -ECONNABORTED;
1436 1444
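The hunk above moves *nbytes = 0 to the top of CIFSSMBWrite so that early returns, such as the -ECONNABORTED path, leave the out-parameter defined. A toy illustration of the convention:

#include <errno.h>
#include <stdio.h>

/* Zero the out-parameter before any exit path so callers never read a
 * stale byte count after an error. */
static int write_op(int connected, unsigned int want, unsigned int *nbytes)
{
	*nbytes = 0;		/* defined on every return path */
	if (!connected)
		return -ECONNABORTED;
	*nbytes = want;		/* pretend the whole write landed */
	return 0;
}

int main(void)
{
	unsigned int n = 0xdeadbeef;
	int rc = write_op(0, 512, &n);

	printf("rc=%d nbytes=%u\n", rc, n);	/* rc=-103 nbytes=0 */
	return 0;
}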
@@ -1511,12 +1519,19 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
1511 (struct smb_hdr *) pSMBr, &bytes_returned, long_op); 1519 (struct smb_hdr *) pSMBr, &bytes_returned, long_op);
1512 cifs_stats_inc(&tcon->num_writes); 1520 cifs_stats_inc(&tcon->num_writes);
1513 if (rc) { 1521 if (rc) {
1514 cFYI(1, ("Send error in write = %d", rc)); 1522 cFYI(1, "Send error in write = %d", rc);
1515 *nbytes = 0;
1516 } else { 1523 } else {
1517 *nbytes = le16_to_cpu(pSMBr->CountHigh); 1524 *nbytes = le16_to_cpu(pSMBr->CountHigh);
1518 *nbytes = (*nbytes) << 16; 1525 *nbytes = (*nbytes) << 16;
1519 *nbytes += le16_to_cpu(pSMBr->Count); 1526 *nbytes += le16_to_cpu(pSMBr->Count);
1527
1528 /*
1529 * Mask off high 16 bits when bytes written as returned by the
1530 * server is greater than bytes requested by the client. Some
1531 * OS/2 servers are known to set incorrect CountHigh values.
1532 */
1533 if (*nbytes > count)
1534 *nbytes &= 0xFFFF;
1520 } 1535 }
1521 1536
1522 cifs_buf_release(pSMB); 1537 cifs_buf_release(pSMB);
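The masking added above, and repeated in the CIFSSMBWrite2 hunk further down, works around servers (per the comment, some OS/2 ones) that put garbage in CountHigh: when the reassembled count exceeds what was requested, only the low 16 bits are trusted. A runnable sketch:

#include <assert.h>
#include <stdio.h>

/* Rebuild the written-byte count from the response's two 16-bit
 * fields; a total larger than the request means CountHigh is bogus,
 * so fall back to the low 16 bits. */
static unsigned int decode_write_count(unsigned short count_high,
				       unsigned short count_low,
				       unsigned int requested)
{
	unsigned int nbytes = ((unsigned int)count_high << 16) + count_low;

	if (nbytes > requested)
		nbytes &= 0xFFFF;
	return nbytes;
}

int main(void)
{
	/* sane server: 0x12345 bytes written of a 0x20000-byte request */
	assert(decode_write_count(0x0001, 0x2345, 0x20000) == 0x12345);
	/* buggy server: garbage CountHigh on a 4096-byte write */
	assert(decode_write_count(0xFFFF, 4096, 4096) == 4096);
	puts("ok");
	return 0;
}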
@@ -1541,7 +1556,7 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
1541 1556
1542 *nbytes = 0; 1557 *nbytes = 0;
1543 1558
1544 cFYI(1, ("write2 at %lld %d bytes", (long long)offset, count)); 1559 cFYI(1, "write2 at %lld %d bytes", (long long)offset, count);
1545 1560
1546 if (tcon->ses->capabilities & CAP_LARGE_FILES) { 1561 if (tcon->ses->capabilities & CAP_LARGE_FILES) {
1547 wct = 14; 1562 wct = 14;
@@ -1596,7 +1611,7 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
1596 long_op); 1611 long_op);
1597 cifs_stats_inc(&tcon->num_writes); 1612 cifs_stats_inc(&tcon->num_writes);
1598 if (rc) { 1613 if (rc) {
1599 cFYI(1, ("Send error Write2 = %d", rc)); 1614 cFYI(1, "Send error Write2 = %d", rc);
1600 } else if (resp_buf_type == 0) { 1615 } else if (resp_buf_type == 0) {
1601 /* presumably this can not happen, but best to be safe */ 1616 /* presumably this can not happen, but best to be safe */
1602 rc = -EIO; 1617 rc = -EIO;
@@ -1605,6 +1620,14 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
1605 *nbytes = le16_to_cpu(pSMBr->CountHigh); 1620 *nbytes = le16_to_cpu(pSMBr->CountHigh);
1606 *nbytes = (*nbytes) << 16; 1621 *nbytes = (*nbytes) << 16;
1607 *nbytes += le16_to_cpu(pSMBr->Count); 1622 *nbytes += le16_to_cpu(pSMBr->Count);
1623
1624 /*
1625 * Mask off high 16 bits when bytes written as returned by the
1626 * server is greater than bytes requested by the client. OS/2
1627 * servers are known to set incorrect CountHigh values.
1628 */
1629 if (*nbytes > count)
1630 *nbytes &= 0xFFFF;
1608 } 1631 }
1609 1632
1610/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ 1633/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
@@ -1633,7 +1656,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
1633 int timeout = 0; 1656 int timeout = 0;
1634 __u16 count; 1657 __u16 count;
1635 1658
1636 cFYI(1, ("CIFSSMBLock timeout %d numLock %d", (int)waitFlag, numLock)); 1659 cFYI(1, "CIFSSMBLock timeout %d numLock %d", (int)waitFlag, numLock);
1637 rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB); 1660 rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB);
1638 1661
1639 if (rc) 1662 if (rc)
@@ -1681,7 +1704,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
1681 } 1704 }
1682 cifs_stats_inc(&tcon->num_locks); 1705 cifs_stats_inc(&tcon->num_locks);
1683 if (rc) 1706 if (rc)
1684 cFYI(1, ("Send error in Lock = %d", rc)); 1707 cFYI(1, "Send error in Lock = %d", rc);
1685 1708
1686 /* Note: On -EAGAIN error only caller can retry on handle based calls 1709 /* Note: On -EAGAIN error only caller can retry on handle based calls
1687 since file handle passed in no longer valid */ 1710 since file handle passed in no longer valid */
@@ -1704,7 +1727,7 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
1704 __u16 params, param_offset, offset, byte_count, count; 1727 __u16 params, param_offset, offset, byte_count, count;
1705 struct kvec iov[1]; 1728 struct kvec iov[1];
1706 1729
1707 cFYI(1, ("Posix Lock")); 1730 cFYI(1, "Posix Lock");
1708 1731
1709 if (pLockData == NULL) 1732 if (pLockData == NULL)
1710 return -EINVAL; 1733 return -EINVAL;
@@ -1774,7 +1797,7 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
1774 } 1797 }
1775 1798
1776 if (rc) { 1799 if (rc) {
1777 cFYI(1, ("Send error in Posix Lock = %d", rc)); 1800 cFYI(1, "Send error in Posix Lock = %d", rc);
1778 } else if (get_flag) { 1801 } else if (get_flag) {
1779 /* lock structure can be returned on get */ 1802 /* lock structure can be returned on get */
1780 __u16 data_offset; 1803 __u16 data_offset;
@@ -1793,8 +1816,21 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
1793 } 1816 }
1794 parm_data = (struct cifs_posix_lock *) 1817 parm_data = (struct cifs_posix_lock *)
1795 ((char *)&pSMBr->hdr.Protocol + data_offset); 1818 ((char *)&pSMBr->hdr.Protocol + data_offset);
1796 if (parm_data->lock_type == cpu_to_le16(CIFS_UNLCK)) 1819 if (parm_data->lock_type == __constant_cpu_to_le16(CIFS_UNLCK))
1797 pLockData->fl_type = F_UNLCK; 1820 pLockData->fl_type = F_UNLCK;
1821 else {
1822 if (parm_data->lock_type ==
1823 __constant_cpu_to_le16(CIFS_RDLCK))
1824 pLockData->fl_type = F_RDLCK;
1825 else if (parm_data->lock_type ==
1826 __constant_cpu_to_le16(CIFS_WRLCK))
1827 pLockData->fl_type = F_WRLCK;
1828
1829 pLockData->fl_start = parm_data->start;
1830 pLockData->fl_end = parm_data->start +
1831 parm_data->length - 1;
1832 pLockData->fl_pid = parm_data->pid;
1833 }
1798 } 1834 }
1799 1835
1800plk_err_exit: 1836plk_err_exit:
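The GET branch of CIFSSMBPosixLock now copies the whole conflicting lock back to the caller, mapping the wire lock type to an fcntl() type and computing the inclusive end as start + length - 1, where before it only recognised unlock. A user-space sketch, assuming the CIFS Unix Extensions type values 0/1/2 for read/write/unlock:

#include <fcntl.h>
#include <stdio.h>

/* Assumed CIFS Unix Extensions lock-type values */
enum { CIFS_RDLCK = 0, CIFS_WRLCK = 1, CIFS_UNLCK = 2 };

/* Map the wire lock type onto the fcntl() types the patched GET path
 * stores in struct file_lock. */
static short cifs_to_fl_type(unsigned short wire_type)
{
	switch (wire_type) {
	case CIFS_RDLCK: return F_RDLCK;
	case CIFS_WRLCK: return F_WRLCK;
	default:         return F_UNLCK;
	}
}

int main(void)
{
	/* a conflicting write lock covering bytes [100, 149] */
	unsigned long long start = 100, length = 50;

	printf("type=%d start=%llu end=%llu\n",
	       cifs_to_fl_type(CIFS_WRLCK), start, start + length - 1);
	return 0;
}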
@@ -1818,7 +1854,7 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
1818{ 1854{
1819 int rc = 0; 1855 int rc = 0;
1820 CLOSE_REQ *pSMB = NULL; 1856 CLOSE_REQ *pSMB = NULL;
1821 cFYI(1, ("In CIFSSMBClose")); 1857 cFYI(1, "In CIFSSMBClose");
1822 1858
1823/* do not retry on dead session on close */ 1859/* do not retry on dead session on close */
1824 rc = small_smb_init(SMB_COM_CLOSE, 3, tcon, (void **) &pSMB); 1860 rc = small_smb_init(SMB_COM_CLOSE, 3, tcon, (void **) &pSMB);
@@ -1835,7 +1871,7 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
1835 if (rc) { 1871 if (rc) {
1836 if (rc != -EINTR) { 1872 if (rc != -EINTR) {
1837 /* EINTR is expected when user ctl-c to kill app */ 1873 /* EINTR is expected when user ctl-c to kill app */
1838 cERROR(1, ("Send error in Close = %d", rc)); 1874 cERROR(1, "Send error in Close = %d", rc);
1839 } 1875 }
1840 } 1876 }
1841 1877
@@ -1851,7 +1887,7 @@ CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
1851{ 1887{
1852 int rc = 0; 1888 int rc = 0;
1853 FLUSH_REQ *pSMB = NULL; 1889 FLUSH_REQ *pSMB = NULL;
1854 cFYI(1, ("In CIFSSMBFlush")); 1890 cFYI(1, "In CIFSSMBFlush");
1855 1891
1856 rc = small_smb_init(SMB_COM_FLUSH, 1, tcon, (void **) &pSMB); 1892 rc = small_smb_init(SMB_COM_FLUSH, 1, tcon, (void **) &pSMB);
1857 if (rc) 1893 if (rc)
@@ -1862,7 +1898,7 @@ CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
1862 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); 1898 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
1863 cifs_stats_inc(&tcon->num_flushes); 1899 cifs_stats_inc(&tcon->num_flushes);
1864 if (rc) 1900 if (rc)
1865 cERROR(1, ("Send error in Flush = %d", rc)); 1901 cERROR(1, "Send error in Flush = %d", rc);
1866 1902
1867 return rc; 1903 return rc;
1868} 1904}
@@ -1879,7 +1915,7 @@ CIFSSMBRename(const int xid, struct cifsTconInfo *tcon,
1879 int name_len, name_len2; 1915 int name_len, name_len2;
1880 __u16 count; 1916 __u16 count;
1881 1917
1882 cFYI(1, ("In CIFSSMBRename")); 1918 cFYI(1, "In CIFSSMBRename");
1883renameRetry: 1919renameRetry:
1884 rc = smb_init(SMB_COM_RENAME, 1, tcon, (void **) &pSMB, 1920 rc = smb_init(SMB_COM_RENAME, 1, tcon, (void **) &pSMB,
1885 (void **) &pSMBr); 1921 (void **) &pSMBr);
@@ -1925,7 +1961,7 @@ renameRetry:
1925 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 1961 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
1926 cifs_stats_inc(&tcon->num_renames); 1962 cifs_stats_inc(&tcon->num_renames);
1927 if (rc) 1963 if (rc)
1928 cFYI(1, ("Send error in rename = %d", rc)); 1964 cFYI(1, "Send error in rename = %d", rc);
1929 1965
1930 cifs_buf_release(pSMB); 1966 cifs_buf_release(pSMB);
1931 1967
@@ -1949,7 +1985,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
1949 int len_of_str; 1985 int len_of_str;
1950 __u16 params, param_offset, offset, count, byte_count; 1986 __u16 params, param_offset, offset, count, byte_count;
1951 1987
1952 cFYI(1, ("Rename to File by handle")); 1988 cFYI(1, "Rename to File by handle");
1953 rc = smb_init(SMB_COM_TRANSACTION2, 15, pTcon, (void **) &pSMB, 1989 rc = smb_init(SMB_COM_TRANSACTION2, 15, pTcon, (void **) &pSMB,
1954 (void **) &pSMBr); 1990 (void **) &pSMBr);
1955 if (rc) 1991 if (rc)
@@ -2004,7 +2040,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
2004 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2040 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2005 cifs_stats_inc(&pTcon->num_t2renames); 2041 cifs_stats_inc(&pTcon->num_t2renames);
2006 if (rc) 2042 if (rc)
2007 cFYI(1, ("Send error in Rename (by file handle) = %d", rc)); 2043 cFYI(1, "Send error in Rename (by file handle) = %d", rc);
2008 2044
2009 cifs_buf_release(pSMB); 2045 cifs_buf_release(pSMB);
2010 2046
@@ -2026,7 +2062,7 @@ CIFSSMBCopy(const int xid, struct cifsTconInfo *tcon, const char *fromName,
2026 int name_len, name_len2; 2062 int name_len, name_len2;
2027 __u16 count; 2063 __u16 count;
2028 2064
2029 cFYI(1, ("In CIFSSMBCopy")); 2065 cFYI(1, "In CIFSSMBCopy");
2030copyRetry: 2066copyRetry:
2031 rc = smb_init(SMB_COM_COPY, 1, tcon, (void **) &pSMB, 2067 rc = smb_init(SMB_COM_COPY, 1, tcon, (void **) &pSMB,
2032 (void **) &pSMBr); 2068 (void **) &pSMBr);
@@ -2071,8 +2107,8 @@ copyRetry:
2071 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 2107 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
2072 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2108 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2073 if (rc) { 2109 if (rc) {
2074 cFYI(1, ("Send error in copy = %d with %d files copied", 2110 cFYI(1, "Send error in copy = %d with %d files copied",
2075 rc, le16_to_cpu(pSMBr->CopyCount))); 2111 rc, le16_to_cpu(pSMBr->CopyCount));
2076 } 2112 }
2077 cifs_buf_release(pSMB); 2113 cifs_buf_release(pSMB);
2078 2114
@@ -2096,7 +2132,7 @@ CIFSUnixCreateSymLink(const int xid, struct cifsTconInfo *tcon,
2096 int bytes_returned = 0; 2132 int bytes_returned = 0;
2097 __u16 params, param_offset, offset, byte_count; 2133 __u16 params, param_offset, offset, byte_count;
2098 2134
2099 cFYI(1, ("In Symlink Unix style")); 2135 cFYI(1, "In Symlink Unix style");
2100createSymLinkRetry: 2136createSymLinkRetry:
2101 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 2137 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
2102 (void **) &pSMBr); 2138 (void **) &pSMBr);
@@ -2161,7 +2197,7 @@ createSymLinkRetry:
2161 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2197 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2162 cifs_stats_inc(&tcon->num_symlinks); 2198 cifs_stats_inc(&tcon->num_symlinks);
2163 if (rc) 2199 if (rc)
2164 cFYI(1, ("Send error in SetPathInfo create symlink = %d", rc)); 2200 cFYI(1, "Send error in SetPathInfo create symlink = %d", rc);
2165 2201
2166 cifs_buf_release(pSMB); 2202 cifs_buf_release(pSMB);
2167 2203
@@ -2185,7 +2221,7 @@ CIFSUnixCreateHardLink(const int xid, struct cifsTconInfo *tcon,
2185 int bytes_returned = 0; 2221 int bytes_returned = 0;
2186 __u16 params, param_offset, offset, byte_count; 2222 __u16 params, param_offset, offset, byte_count;
2187 2223
2188 cFYI(1, ("In Create Hard link Unix style")); 2224 cFYI(1, "In Create Hard link Unix style");
2189createHardLinkRetry: 2225createHardLinkRetry:
2190 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 2226 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
2191 (void **) &pSMBr); 2227 (void **) &pSMBr);
@@ -2247,7 +2283,7 @@ createHardLinkRetry:
2247 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2283 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2248 cifs_stats_inc(&tcon->num_hardlinks); 2284 cifs_stats_inc(&tcon->num_hardlinks);
2249 if (rc) 2285 if (rc)
2250 cFYI(1, ("Send error in SetPathInfo (hard link) = %d", rc)); 2286 cFYI(1, "Send error in SetPathInfo (hard link) = %d", rc);
2251 2287
2252 cifs_buf_release(pSMB); 2288 cifs_buf_release(pSMB);
2253 if (rc == -EAGAIN) 2289 if (rc == -EAGAIN)
@@ -2268,7 +2304,7 @@ CIFSCreateHardLink(const int xid, struct cifsTconInfo *tcon,
2268 int name_len, name_len2; 2304 int name_len, name_len2;
2269 __u16 count; 2305 __u16 count;
2270 2306
2271 cFYI(1, ("In CIFSCreateHardLink")); 2307 cFYI(1, "In CIFSCreateHardLink");
2272winCreateHardLinkRetry: 2308winCreateHardLinkRetry:
2273 2309
2274 rc = smb_init(SMB_COM_NT_RENAME, 4, tcon, (void **) &pSMB, 2310 rc = smb_init(SMB_COM_NT_RENAME, 4, tcon, (void **) &pSMB,
@@ -2319,7 +2355,7 @@ winCreateHardLinkRetry:
2319 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2355 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2320 cifs_stats_inc(&tcon->num_hardlinks); 2356 cifs_stats_inc(&tcon->num_hardlinks);
2321 if (rc) 2357 if (rc)
2322 cFYI(1, ("Send error in hard link (NT rename) = %d", rc)); 2358 cFYI(1, "Send error in hard link (NT rename) = %d", rc);
2323 2359
2324 cifs_buf_release(pSMB); 2360 cifs_buf_release(pSMB);
2325 if (rc == -EAGAIN) 2361 if (rc == -EAGAIN)
@@ -2342,7 +2378,7 @@ CIFSSMBUnixQuerySymLink(const int xid, struct cifsTconInfo *tcon,
2342 __u16 params, byte_count; 2378 __u16 params, byte_count;
2343 char *data_start; 2379 char *data_start;
2344 2380
2345 cFYI(1, ("In QPathSymLinkInfo (Unix) for path %s", searchName)); 2381 cFYI(1, "In QPathSymLinkInfo (Unix) for path %s", searchName);
2346 2382
2347querySymLinkRetry: 2383querySymLinkRetry:
2348 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 2384 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -2389,7 +2425,7 @@ querySymLinkRetry:
2389 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 2425 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
2390 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2426 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2391 if (rc) { 2427 if (rc) {
2392 cFYI(1, ("Send error in QuerySymLinkInfo = %d", rc)); 2428 cFYI(1, "Send error in QuerySymLinkInfo = %d", rc);
2393 } else { 2429 } else {
2394 /* decode response */ 2430 /* decode response */
2395 2431
@@ -2490,21 +2526,21 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
2490 2526
2491 /* should we also check that parm and data areas do not overlap? */ 2527 /* should we also check that parm and data areas do not overlap? */
2492 if (*ppparm > end_of_smb) { 2528 if (*ppparm > end_of_smb) {
2493 cFYI(1, ("parms start after end of smb")); 2529 cFYI(1, "parms start after end of smb");
2494 return -EINVAL; 2530 return -EINVAL;
2495 } else if (parm_count + *ppparm > end_of_smb) { 2531 } else if (parm_count + *ppparm > end_of_smb) {
2496 cFYI(1, ("parm end after end of smb")); 2532 cFYI(1, "parm end after end of smb");
2497 return -EINVAL; 2533 return -EINVAL;
2498 } else if (*ppdata > end_of_smb) { 2534 } else if (*ppdata > end_of_smb) {
2499 cFYI(1, ("data starts after end of smb")); 2535 cFYI(1, "data starts after end of smb");
2500 return -EINVAL; 2536 return -EINVAL;
2501 } else if (data_count + *ppdata > end_of_smb) { 2537 } else if (data_count + *ppdata > end_of_smb) {
2502 cFYI(1, ("data %p + count %d (%p) ends after end of smb %p start %p", 2538 cFYI(1, "data %p + count %d (%p) past smb end %p start %p",
2503 *ppdata, data_count, (data_count + *ppdata), 2539 *ppdata, data_count, (data_count + *ppdata),
2504 end_of_smb, pSMBr)); 2540 end_of_smb, pSMBr);
2505 return -EINVAL; 2541 return -EINVAL;
2506 } else if (parm_count + data_count > pSMBr->ByteCount) { 2542 } else if (parm_count + data_count > pSMBr->ByteCount) {
2507 cFYI(1, ("parm count and data count larger than SMB")); 2543 cFYI(1, "parm count and data count larger than SMB");
2508 return -EINVAL; 2544 return -EINVAL;
2509 } 2545 }
2510 *pdatalen = data_count; 2546 *pdatalen = data_count;
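validate_ntransact() above treats every offset and count in the response as untrusted, checking both the start and the end of each region against the received SMB. The generic, overflow-safe shape of those checks:

#include <stdbool.h>
#include <stddef.h>

/* A wire-supplied region is usable only if its start and its end both
 * land inside the received buffer. Writing the end test as
 * count <= buflen - off avoids the integer overflow that the naive
 * off + count <= buflen form could hit. */
static bool region_in_buf(size_t buflen, size_t off, size_t count)
{
	return off <= buflen && count <= buflen - off;
}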
@@ -2523,7 +2559,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
2523 struct smb_com_transaction_ioctl_req *pSMB; 2559 struct smb_com_transaction_ioctl_req *pSMB;
2524 struct smb_com_transaction_ioctl_rsp *pSMBr; 2560 struct smb_com_transaction_ioctl_rsp *pSMBr;
2525 2561
2526 cFYI(1, ("In Windows reparse style QueryLink for path %s", searchName)); 2562 cFYI(1, "In Windows reparse style QueryLink for path %s", searchName);
2527 rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB, 2563 rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
2528 (void **) &pSMBr); 2564 (void **) &pSMBr);
2529 if (rc) 2565 if (rc)
@@ -2552,7 +2588,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
2552 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 2588 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
2553 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2589 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2554 if (rc) { 2590 if (rc) {
2555 cFYI(1, ("Send error in QueryReparseLinkInfo = %d", rc)); 2591 cFYI(1, "Send error in QueryReparseLinkInfo = %d", rc);
2556 } else { /* decode response */ 2592 } else { /* decode response */
2557 __u32 data_offset = le32_to_cpu(pSMBr->DataOffset); 2593 __u32 data_offset = le32_to_cpu(pSMBr->DataOffset);
2558 __u32 data_count = le32_to_cpu(pSMBr->DataCount); 2594 __u32 data_count = le32_to_cpu(pSMBr->DataCount);
@@ -2576,7 +2612,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
2576 if ((reparse_buf->LinkNamesBuf + 2612 if ((reparse_buf->LinkNamesBuf +
2577 reparse_buf->TargetNameOffset + 2613 reparse_buf->TargetNameOffset +
2578 reparse_buf->TargetNameLen) > end_of_smb) { 2614 reparse_buf->TargetNameLen) > end_of_smb) {
2579 cFYI(1, ("reparse buf beyond SMB")); 2615 cFYI(1, "reparse buf beyond SMB");
2580 rc = -EIO; 2616 rc = -EIO;
2581 goto qreparse_out; 2617 goto qreparse_out;
2582 } 2618 }
@@ -2597,12 +2633,12 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
2597 } 2633 }
2598 } else { 2634 } else {
2599 rc = -EIO; 2635 rc = -EIO;
2600 cFYI(1, ("Invalid return data count on " 2636 cFYI(1, "Invalid return data count on "
2601 "get reparse info ioctl")); 2637 "get reparse info ioctl");
2602 } 2638 }
2603 symlinkinfo[buflen] = 0; /* just in case so the caller 2639 symlinkinfo[buflen] = 0; /* just in case so the caller
2604 does not go off the end of the buffer */ 2640 does not go off the end of the buffer */
2605 cFYI(1, ("readlink result - %s", symlinkinfo)); 2641 cFYI(1, "readlink result - %s", symlinkinfo);
2606 } 2642 }
2607 2643
2608qreparse_out: 2644qreparse_out:
@@ -2625,7 +2661,7 @@ static void cifs_convert_ace(posix_acl_xattr_entry *ace,
2625 ace->e_perm = cpu_to_le16(cifs_ace->cifs_e_perm); 2661 ace->e_perm = cpu_to_le16(cifs_ace->cifs_e_perm);
2626 ace->e_tag = cpu_to_le16(cifs_ace->cifs_e_tag); 2662 ace->e_tag = cpu_to_le16(cifs_ace->cifs_e_tag);
2627 ace->e_id = cpu_to_le32(le64_to_cpu(cifs_ace->cifs_uid)); 2663 ace->e_id = cpu_to_le32(le64_to_cpu(cifs_ace->cifs_uid));
2628 /* cFYI(1,("perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id)); */ 2664 /* cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id); */
2629 2665
2630 return; 2666 return;
2631} 2667}
@@ -2651,8 +2687,8 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
2651 size += sizeof(struct cifs_posix_ace) * count; 2687 size += sizeof(struct cifs_posix_ace) * count;
2652 /* check if we would go beyond end of SMB */ 2688 /* check if we would go beyond end of SMB */
2653 if (size_of_data_area < size) { 2689 if (size_of_data_area < size) {
2654 cFYI(1, ("bad CIFS POSIX ACL size %d vs. %d", 2690 cFYI(1, "bad CIFS POSIX ACL size %d vs. %d",
2655 size_of_data_area, size)); 2691 size_of_data_area, size);
2656 return -EINVAL; 2692 return -EINVAL;
2657 } 2693 }
2658 } else if (acl_type & ACL_TYPE_DEFAULT) { 2694 } else if (acl_type & ACL_TYPE_DEFAULT) {
@@ -2699,7 +2735,7 @@ static __u16 convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace,
2699 cifs_ace->cifs_uid = cpu_to_le64(-1); 2735 cifs_ace->cifs_uid = cpu_to_le64(-1);
2700 } else 2736 } else
2701 cifs_ace->cifs_uid = cpu_to_le64(le32_to_cpu(local_ace->e_id)); 2737 cifs_ace->cifs_uid = cpu_to_le64(le32_to_cpu(local_ace->e_id));
2702 /*cFYI(1,("perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id));*/ 2738 /*cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id);*/
2703 return rc; 2739 return rc;
2704} 2740}
2705 2741
@@ -2717,12 +2753,12 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
2717 return 0; 2753 return 0;
2718 2754
2719 count = posix_acl_xattr_count((size_t)buflen); 2755 count = posix_acl_xattr_count((size_t)buflen);
2720 cFYI(1, ("setting acl with %d entries from buf of length %d and " 2756 cFYI(1, "setting acl with %d entries from buf of length %d and "
2721 "version of %d", 2757 "version of %d",
2722 count, buflen, le32_to_cpu(local_acl->a_version))); 2758 count, buflen, le32_to_cpu(local_acl->a_version));
2723 if (le32_to_cpu(local_acl->a_version) != 2) { 2759 if (le32_to_cpu(local_acl->a_version) != 2) {
2724 cFYI(1, ("unknown POSIX ACL version %d", 2760 cFYI(1, "unknown POSIX ACL version %d",
2725 le32_to_cpu(local_acl->a_version))); 2761 le32_to_cpu(local_acl->a_version));
2726 return 0; 2762 return 0;
2727 } 2763 }
2728 cifs_acl->version = cpu_to_le16(1); 2764 cifs_acl->version = cpu_to_le16(1);
@@ -2731,7 +2767,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
2731 else if (acl_type == ACL_TYPE_DEFAULT) 2767 else if (acl_type == ACL_TYPE_DEFAULT)
2732 cifs_acl->default_entry_count = cpu_to_le16(count); 2768 cifs_acl->default_entry_count = cpu_to_le16(count);
2733 else { 2769 else {
2734 cFYI(1, ("unknown ACL type %d", acl_type)); 2770 cFYI(1, "unknown ACL type %d", acl_type);
2735 return 0; 2771 return 0;
2736 } 2772 }
2737 for (i = 0; i < count; i++) { 2773 for (i = 0; i < count; i++) {
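posix_acl_xattr_count() in the hunk above derives the ACL entry count purely from the xattr blob length. A sketch of that derivation, assuming the standard v2 layout of a 4-byte a_version header followed by fixed 8-byte entries (e_tag, e_perm, e_id):

#include <stddef.h>

/* Entry count falls straight out of the blob length: strip the 4-byte
 * header, then divide by the 8-byte entry size; anything that does not
 * divide evenly is not a well-formed v2 blob. */
static int acl_xattr_count(size_t buflen)
{
	if (buflen < 4 || (buflen - 4) % 8 != 0)
		return -1;
	return (int)((buflen - 4) / 8);
}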
@@ -2764,7 +2800,7 @@ CIFSSMBGetPosixACL(const int xid, struct cifsTconInfo *tcon,
2764 int name_len; 2800 int name_len;
2765 __u16 params, byte_count; 2801 __u16 params, byte_count;
2766 2802
2767 cFYI(1, ("In GetPosixACL (Unix) for path %s", searchName)); 2803 cFYI(1, "In GetPosixACL (Unix) for path %s", searchName);
2768 2804
2769queryAclRetry: 2805queryAclRetry:
2770 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 2806 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -2816,7 +2852,7 @@ queryAclRetry:
2816 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2852 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2817 cifs_stats_inc(&tcon->num_acl_get); 2853 cifs_stats_inc(&tcon->num_acl_get);
2818 if (rc) { 2854 if (rc) {
2819 cFYI(1, ("Send error in Query POSIX ACL = %d", rc)); 2855 cFYI(1, "Send error in Query POSIX ACL = %d", rc);
2820 } else { 2856 } else {
2821 /* decode response */ 2857 /* decode response */
2822 2858
@@ -2853,7 +2889,7 @@ CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon,
2853 int bytes_returned = 0; 2889 int bytes_returned = 0;
2854 __u16 params, byte_count, data_count, param_offset, offset; 2890 __u16 params, byte_count, data_count, param_offset, offset;
2855 2891
2856 cFYI(1, ("In SetPosixACL (Unix) for path %s", fileName)); 2892 cFYI(1, "In SetPosixACL (Unix) for path %s", fileName);
2857setAclRetry: 2893setAclRetry:
2858 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 2894 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
2859 (void **) &pSMBr); 2895 (void **) &pSMBr);
@@ -2908,7 +2944,7 @@ setAclRetry:
2908 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 2944 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
2909 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2945 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2910 if (rc) 2946 if (rc)
2911 cFYI(1, ("Set POSIX ACL returned %d", rc)); 2947 cFYI(1, "Set POSIX ACL returned %d", rc);
2912 2948
2913setACLerrorExit: 2949setACLerrorExit:
2914 cifs_buf_release(pSMB); 2950 cifs_buf_release(pSMB);
@@ -2928,7 +2964,7 @@ CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon,
2928 int bytes_returned; 2964 int bytes_returned;
2929 __u16 params, byte_count; 2965 __u16 params, byte_count;
2930 2966
2931 cFYI(1, ("In GetExtAttr")); 2967 cFYI(1, "In GetExtAttr");
2932 if (tcon == NULL) 2968 if (tcon == NULL)
2933 return -ENODEV; 2969 return -ENODEV;
2934 2970
@@ -2967,7 +3003,7 @@ GetExtAttrRetry:
2967 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3003 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
2968 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3004 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2969 if (rc) { 3005 if (rc) {
2970 cFYI(1, ("error %d in GetExtAttr", rc)); 3006 cFYI(1, "error %d in GetExtAttr", rc);
2971 } else { 3007 } else {
2972 /* decode response */ 3008 /* decode response */
2973 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3009 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -2982,7 +3018,7 @@ GetExtAttrRetry:
2982 struct file_chattr_info *pfinfo; 3018 struct file_chattr_info *pfinfo;
2983 /* BB Do we need a cast or hash here ? */ 3019 /* BB Do we need a cast or hash here ? */
2984 if (count != 16) { 3020 if (count != 16) {
2985 cFYI(1, ("Illegal size ret in GetExtAttr")); 3021 cFYI(1, "Illegal size ret in GetExtAttr");
2986 rc = -EIO; 3022 rc = -EIO;
2987 goto GetExtAttrOut; 3023 goto GetExtAttrOut;
2988 } 3024 }
@@ -3012,7 +3048,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3012 QUERY_SEC_DESC_REQ *pSMB; 3048 QUERY_SEC_DESC_REQ *pSMB;
3013 struct kvec iov[1]; 3049 struct kvec iov[1];
3014 3050
3015 cFYI(1, ("GetCifsACL")); 3051 cFYI(1, "GetCifsACL");
3016 3052
3017 *pbuflen = 0; 3053 *pbuflen = 0;
3018 *acl_inf = NULL; 3054 *acl_inf = NULL;
@@ -3037,7 +3073,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3037 CIFS_STD_OP); 3073 CIFS_STD_OP);
3038 cifs_stats_inc(&tcon->num_acl_get); 3074 cifs_stats_inc(&tcon->num_acl_get);
3039 if (rc) { 3075 if (rc) {
3040 cFYI(1, ("Send error in QuerySecDesc = %d", rc)); 3076 cFYI(1, "Send error in QuerySecDesc = %d", rc);
3041 } else { /* decode response */ 3077 } else { /* decode response */
3042 __le32 *parm; 3078 __le32 *parm;
3043 __u32 parm_len; 3079 __u32 parm_len;
@@ -3052,7 +3088,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3052 goto qsec_out; 3088 goto qsec_out;
3053 pSMBr = (struct smb_com_ntransact_rsp *)iov[0].iov_base; 3089 pSMBr = (struct smb_com_ntransact_rsp *)iov[0].iov_base;
3054 3090
3055 cFYI(1, ("smb %p parm %p data %p", pSMBr, parm, *acl_inf)); 3091 cFYI(1, "smb %p parm %p data %p", pSMBr, parm, *acl_inf);
3056 3092
3057 if (le32_to_cpu(pSMBr->ParameterCount) != 4) { 3093 if (le32_to_cpu(pSMBr->ParameterCount) != 4) {
3058 rc = -EIO; /* bad smb */ 3094 rc = -EIO; /* bad smb */
@@ -3064,8 +3100,8 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3064 3100
3065 acl_len = le32_to_cpu(*parm); 3101 acl_len = le32_to_cpu(*parm);
3066 if (acl_len != *pbuflen) { 3102 if (acl_len != *pbuflen) {
3067 cERROR(1, ("acl length %d does not match %d", 3103 cERROR(1, "acl length %d does not match %d",
3068 acl_len, *pbuflen)); 3104 acl_len, *pbuflen);
3069 if (*pbuflen > acl_len) 3105 if (*pbuflen > acl_len)
3070 *pbuflen = acl_len; 3106 *pbuflen = acl_len;
3071 } 3107 }
@@ -3074,7 +3110,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3074 header followed by the smallest SID */ 3110 header followed by the smallest SID */
3075 if ((*pbuflen < sizeof(struct cifs_ntsd) + 8) || 3111 if ((*pbuflen < sizeof(struct cifs_ntsd) + 8) ||
3076 (*pbuflen >= 64 * 1024)) { 3112 (*pbuflen >= 64 * 1024)) {
3077 cERROR(1, ("bad acl length %d", *pbuflen)); 3113 cERROR(1, "bad acl length %d", *pbuflen);
3078 rc = -EINVAL; 3114 rc = -EINVAL;
3079 *pbuflen = 0; 3115 *pbuflen = 0;
3080 } else { 3116 } else {
@@ -3148,9 +3184,9 @@ setCifsAclRetry:
3148 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3184 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3149 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3185 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3150 3186
3151 cFYI(1, ("SetCIFSACL bytes_returned: %d, rc: %d", bytes_returned, rc)); 3187 cFYI(1, "SetCIFSACL bytes_returned: %d, rc: %d", bytes_returned, rc);
3152 if (rc) 3188 if (rc)
3153 cFYI(1, ("Set CIFS ACL returned %d", rc)); 3189 cFYI(1, "Set CIFS ACL returned %d", rc);
3154 cifs_buf_release(pSMB); 3190 cifs_buf_release(pSMB);
3155 3191
3156 if (rc == -EAGAIN) 3192 if (rc == -EAGAIN)
@@ -3174,7 +3210,7 @@ int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
3174 int bytes_returned; 3210 int bytes_returned;
3175 int name_len; 3211 int name_len;
3176 3212
3177 cFYI(1, ("In SMBQPath path %s", searchName)); 3213 cFYI(1, "In SMBQPath path %s", searchName);
3178QInfRetry: 3214QInfRetry:
3179 rc = smb_init(SMB_COM_QUERY_INFORMATION, 0, tcon, (void **) &pSMB, 3215 rc = smb_init(SMB_COM_QUERY_INFORMATION, 0, tcon, (void **) &pSMB,
3180 (void **) &pSMBr); 3216 (void **) &pSMBr);
@@ -3200,7 +3236,7 @@ QInfRetry:
3200 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3236 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3201 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3237 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3202 if (rc) { 3238 if (rc) {
3203 cFYI(1, ("Send error in QueryInfo = %d", rc)); 3239 cFYI(1, "Send error in QueryInfo = %d", rc);
3204 } else if (pFinfo) { 3240 } else if (pFinfo) {
3205 struct timespec ts; 3241 struct timespec ts;
3206 __u32 time = le32_to_cpu(pSMBr->last_write_time); 3242 __u32 time = le32_to_cpu(pSMBr->last_write_time);
@@ -3230,8 +3266,72 @@ QInfRetry:
3230 return rc; 3266 return rc;
3231} 3267}
3232 3268
3269int
3270CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
3271 u16 netfid, FILE_ALL_INFO *pFindData)
3272{
3273 struct smb_t2_qfi_req *pSMB = NULL;
3274 struct smb_t2_qfi_rsp *pSMBr = NULL;
3275 int rc = 0;
3276 int bytes_returned;
3277 __u16 params, byte_count;
3278
3279QFileInfoRetry:
3280 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
3281 (void **) &pSMBr);
3282 if (rc)
3283 return rc;
3233 3284
3285 params = 2 /* level */ + 2 /* fid */;
3286 pSMB->t2.TotalDataCount = 0;
3287 pSMB->t2.MaxParameterCount = cpu_to_le16(4);
3288 /* BB find exact max data count below from sess structure BB */
3289 pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
3290 pSMB->t2.MaxSetupCount = 0;
3291 pSMB->t2.Reserved = 0;
3292 pSMB->t2.Flags = 0;
3293 pSMB->t2.Timeout = 0;
3294 pSMB->t2.Reserved2 = 0;
3295 pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req,
3296 Fid) - 4);
3297 pSMB->t2.DataCount = 0;
3298 pSMB->t2.DataOffset = 0;
3299 pSMB->t2.SetupCount = 1;
3300 pSMB->t2.Reserved3 = 0;
3301 pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
3302 byte_count = params + 1 /* pad */ ;
3303 pSMB->t2.TotalParameterCount = cpu_to_le16(params);
3304 pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount;
3305 pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
3306 pSMB->Pad = 0;
3307 pSMB->Fid = netfid;
3308 pSMB->hdr.smb_buf_length += byte_count;
3234 3309
3310 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3311 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3312 if (rc) {
3313 cFYI(1, "Send error in QPathInfo = %d", rc);
3314 } else { /* decode response */
3315 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3316
3317 if (rc) /* BB add auto retry on EOPNOTSUPP? */
3318 rc = -EIO;
3319 else if (pSMBr->ByteCount < 40)
3320 rc = -EIO; /* bad smb */
3321 else if (pFindData) {
3322 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
3323 memcpy((char *) pFindData,
3324 (char *) &pSMBr->hdr.Protocol +
3325 data_offset, sizeof(FILE_ALL_INFO));
3326 } else
3327 rc = -ENOMEM;
3328 }
3329 cifs_buf_release(pSMB);
3330 if (rc == -EAGAIN)
3331 goto QFileInfoRetry;
3332
3333 return rc;
3334}
3235 3335
3236int 3336int
3237CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon, 3337CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
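The hunk above adds CIFSSMBQFileInfo(), a by-handle variant of the path-based query: it issues TRANS2_QUERY_FILE_INFORMATION with SMB_QUERY_FILE_ALL_INFO against an open netfid, which avoids a path round trip and still works on files that are open but already unlinked (the debug string it reuses still says QPathInfo). A hypothetical call site, with xid, tcon and netfid being whatever the caller already holds for the open file:

	FILE_ALL_INFO info;
	int rc = CIFSSMBQFileInfo(xid, tcon, netfid, &info);

	if (rc == 0)	/* EndOfFile is little-endian on the wire */
		cFYI(1, "file size %lld", le64_to_cpu(info.EndOfFile));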
@@ -3248,7 +3348,7 @@ CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
3248 int name_len; 3348 int name_len;
3249 __u16 params, byte_count; 3349 __u16 params, byte_count;
3250 3350
3251/* cFYI(1, ("In QPathInfo path %s", searchName)); */ 3351/* cFYI(1, "In QPathInfo path %s", searchName); */
3252QPathInfoRetry: 3352QPathInfoRetry:
3253 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 3353 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
3254 (void **) &pSMBr); 3354 (void **) &pSMBr);
@@ -3298,7 +3398,7 @@ QPathInfoRetry:
3298 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3398 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3299 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3399 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3300 if (rc) { 3400 if (rc) {
3301 cFYI(1, ("Send error in QPathInfo = %d", rc)); 3401 cFYI(1, "Send error in QPathInfo = %d", rc);
3302 } else { /* decode response */ 3402 } else { /* decode response */
3303 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3403 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3304 3404
@@ -3335,6 +3435,75 @@ QPathInfoRetry:
3335} 3435}
3336 3436
3337int 3437int
3438CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
3439 u16 netfid, FILE_UNIX_BASIC_INFO *pFindData)
3440{
3441 struct smb_t2_qfi_req *pSMB = NULL;
3442 struct smb_t2_qfi_rsp *pSMBr = NULL;
3443 int rc = 0;
3444 int bytes_returned;
3445 __u16 params, byte_count;
3446
3447UnixQFileInfoRetry:
3448 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
3449 (void **) &pSMBr);
3450 if (rc)
3451 return rc;
3452
3453 params = 2 /* level */ + 2 /* fid */;
3454 pSMB->t2.TotalDataCount = 0;
3455 pSMB->t2.MaxParameterCount = cpu_to_le16(4);
3456 /* BB find exact max data count below from sess structure BB */
3457 pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
3458 pSMB->t2.MaxSetupCount = 0;
3459 pSMB->t2.Reserved = 0;
3460 pSMB->t2.Flags = 0;
3461 pSMB->t2.Timeout = 0;
3462 pSMB->t2.Reserved2 = 0;
3463 pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req,
3464 Fid) - 4);
3465 pSMB->t2.DataCount = 0;
3466 pSMB->t2.DataOffset = 0;
3467 pSMB->t2.SetupCount = 1;
3468 pSMB->t2.Reserved3 = 0;
3469 pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
3470 byte_count = params + 1 /* pad */ ;
3471 pSMB->t2.TotalParameterCount = cpu_to_le16(params);
3472 pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount;
3473 pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC);
3474 pSMB->Pad = 0;
3475 pSMB->Fid = netfid;
3476 pSMB->hdr.smb_buf_length += byte_count;
3477
3478 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3479 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3480 if (rc) {
3481 cFYI(1, "Send error in QPathInfo = %d", rc);
3482 } else { /* decode response */
3483 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3484
+		if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
+			cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n"
+				  "Unix Extensions can be disabled on mount "
+				  "by specifying the nosfu mount option.");
+			rc = -EIO;	/* bad smb */
+		} else {
+			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+			memcpy((char *) pFindData,
+			       (char *) &pSMBr->hdr.Protocol +
+			       data_offset,
+			       sizeof(FILE_UNIX_BASIC_INFO));
+		}
+	}
+
+	cifs_buf_release(pSMB);
+	if (rc == -EAGAIN)
+		goto UnixQFileInfoRetry;
+
+	return rc;
+}
+
+int
 CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon,
 		     const unsigned char *searchName,
 		     FILE_UNIX_BASIC_INFO *pFindData,
@@ -3348,7 +3517,7 @@ CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon,
 	int name_len;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In QPathInfo (Unix) the path %s", searchName));
+	cFYI(1, "In QPathInfo (Unix) the path %s", searchName);
 UnixQPathInfoRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -3395,14 +3564,14 @@ UnixQPathInfoRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QPathInfo = %d", rc));
+		cFYI(1, "Send error in QPathInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
 		if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
-			cERROR(1, ("Malformed FILE_UNIX_BASIC_INFO response.\n"
-				   "Unix Extensions can be disabled on mount "
-				   "by specifying the nosfu mount option."));
+			cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n"
+				  "Unix Extensions can be disabled on mount "
+				  "by specifying the nosfu mount option.");
 			rc = -EIO;	/* bad smb */
 		} else {
 			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -3436,7 +3605,7 @@ CIFSFindFirst(const int xid, struct cifsTconInfo *tcon,
 	int name_len;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In FindFirst for %s", searchName));
+	cFYI(1, "In FindFirst for %s", searchName);
 
 findFirstRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -3513,7 +3682,7 @@ findFirstRetry:
 	if (rc) {/* BB add logic to retry regular search if Unix search
 			rejected unexpectedly by server */
 		/* BB Add code to handle unsupported level rc */
-		cFYI(1, ("Error in FindFirst = %d", rc));
+		cFYI(1, "Error in FindFirst = %d", rc);
 
 		cifs_buf_release(pSMB);
 
@@ -3552,7 +3721,7 @@ findFirstRetry:
 			lnoff = le16_to_cpu(parms->LastNameOffset);
 			if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
 			      lnoff) {
-				cERROR(1, ("ignoring corrupt resume name"));
+				cERROR(1, "ignoring corrupt resume name");
 				psrch_inf->last_entry = NULL;
 				return rc;
 			}
@@ -3580,7 +3749,7 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
 	int bytes_returned, name_len;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In FindNext"));
+	cFYI(1, "In FindNext");
 
 	if (psrch_inf->endOfSearch)
 		return -ENOENT;
@@ -3644,7 +3813,7 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
 			cifs_buf_release(pSMB);
 			rc = 0; /* search probably was closed at end of search*/
 		} else
-			cFYI(1, ("FindNext returned = %d", rc));
+			cFYI(1, "FindNext returned = %d", rc);
 	} else { /* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -3680,15 +3849,15 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
 			lnoff = le16_to_cpu(parms->LastNameOffset);
 			if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
 			      lnoff) {
-				cERROR(1, ("ignoring corrupt resume name"));
+				cERROR(1, "ignoring corrupt resume name");
 				psrch_inf->last_entry = NULL;
 				return rc;
 			} else
 				psrch_inf->last_entry =
 					psrch_inf->srch_entries_start + lnoff;
 
-/* cFYI(1,("fnxt2 entries in buf %d index_of_last %d",
-	    psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry)); */
+/* cFYI(1, "fnxt2 entries in buf %d index_of_last %d",
+	    psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry); */
 
 			/* BB fixme add unlock here */
 		}
@@ -3713,7 +3882,7 @@ CIFSFindClose(const int xid, struct cifsTconInfo *tcon,
 	int rc = 0;
 	FINDCLOSE_REQ *pSMB = NULL;
 
-	cFYI(1, ("In CIFSSMBFindClose"));
+	cFYI(1, "In CIFSSMBFindClose");
 	rc = small_smb_init(SMB_COM_FIND_CLOSE2, 1, tcon, (void **)&pSMB);
 
 	/* no sense returning error if session restarted
@@ -3727,7 +3896,7 @@ CIFSFindClose(const int xid, struct cifsTconInfo *tcon,
 	pSMB->ByteCount = 0;
 	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
 	if (rc)
-		cERROR(1, ("Send error in FindClose = %d", rc));
+		cERROR(1, "Send error in FindClose = %d", rc);
 
 	cifs_stats_inc(&tcon->num_fclose);
 
@@ -3750,7 +3919,7 @@ CIFSGetSrvInodeNumber(const int xid, struct cifsTconInfo *tcon,
 	int name_len, bytes_returned;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In GetSrvInodeNum for %s", searchName));
+	cFYI(1, "In GetSrvInodeNum for %s", searchName);
 	if (tcon == NULL)
 		return -ENODEV;
 
@@ -3800,7 +3969,7 @@ GetInodeNumberRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("error %d in QueryInternalInfo", rc));
+		cFYI(1, "error %d in QueryInternalInfo", rc);
 	} else {
 		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -3815,7 +3984,7 @@ GetInodeNumberRetry:
 			struct file_internal_info *pfinfo;
 			/* BB Do we need a cast or hash here ? */
 			if (count < 8) {
-				cFYI(1, ("Illegal size ret in QryIntrnlInf"));
+				cFYI(1, "Illegal size ret in QryIntrnlInf");
 				rc = -EIO;
 				goto GetInodeNumOut;
 			}
@@ -3856,16 +4025,16 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
 	*num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals);
 
 	if (*num_of_nodes < 1) {
-		cERROR(1, ("num_referrals: must be at least > 0,"
-			"but we get num_referrals = %d\n", *num_of_nodes));
+		cERROR(1, "num_referrals: must be at least > 0,"
+			"but we get num_referrals = %d\n", *num_of_nodes);
 		rc = -EINVAL;
 		goto parse_DFS_referrals_exit;
 	}
 
 	ref = (struct dfs_referral_level_3 *) &(pSMBr->referrals);
 	if (ref->VersionNumber != cpu_to_le16(3)) {
-		cERROR(1, ("Referrals of V%d version are not supported,"
-			"should be V3", le16_to_cpu(ref->VersionNumber)));
+		cERROR(1, "Referrals of V%d version are not supported,"
+			"should be V3", le16_to_cpu(ref->VersionNumber));
 		rc = -EINVAL;
 		goto parse_DFS_referrals_exit;
 	}
@@ -3874,19 +4043,19 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
 	data_end = (char *)(&(pSMBr->PathConsumed)) +
 				le16_to_cpu(pSMBr->t2.DataCount);
 
-	cFYI(1, ("num_referrals: %d dfs flags: 0x%x ... \n",
+	cFYI(1, "num_referrals: %d dfs flags: 0x%x ...\n",
 			*num_of_nodes,
-			le32_to_cpu(pSMBr->DFSFlags)));
+			le32_to_cpu(pSMBr->DFSFlags));
 
 	*target_nodes = kzalloc(sizeof(struct dfs_info3_param) *
 			*num_of_nodes, GFP_KERNEL);
 	if (*target_nodes == NULL) {
-		cERROR(1, ("Failed to allocate buffer for target_nodes\n"));
+		cERROR(1, "Failed to allocate buffer for target_nodes\n");
 		rc = -ENOMEM;
 		goto parse_DFS_referrals_exit;
 	}
 
-	/* collect neccessary data from referrals */
+	/* collect necessary data from referrals */
 	for (i = 0; i < *num_of_nodes; i++) {
 		char *temp;
 		int max_len;
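
Note on the allocation in the hunk above: kzalloc(sizeof(struct dfs_info3_param) * *num_of_nodes, GFP_KERNEL) multiplies a fixed size by a server-supplied referral count. An overflow-checked way to spell the same allocation (an idiomatic alternative shown for illustration, not what this patch does) is:

	/* kcalloc(n, size, flags) fails cleanly if n * size would overflow */
	*target_nodes = kcalloc(*num_of_nodes,
				sizeof(struct dfs_info3_param), GFP_KERNEL);
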
@@ -3957,7 +4126,7 @@ CIFSGetDFSRefer(const int xid, struct cifsSesInfo *ses,
 	*num_of_nodes = 0;
 	*target_nodes = NULL;
 
-	cFYI(1, ("In GetDFSRefer the path %s", searchName));
+	cFYI(1, "In GetDFSRefer the path %s", searchName);
 	if (ses == NULL)
 		return -ENODEV;
 getDFSRetry:
@@ -4024,7 +4193,7 @@ getDFSRetry:
 	rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in GetDFSRefer = %d", rc));
+		cFYI(1, "Send error in GetDFSRefer = %d", rc);
 		goto GetDFSRefExit;
 	}
 	rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -4035,9 +4204,9 @@ getDFSRetry:
 		goto GetDFSRefExit;
 	}
 
-	cFYI(1, ("Decoding GetDFSRefer response BCC: %d Offset %d",
-		pSMBr->ByteCount,
-		le16_to_cpu(pSMBr->t2.DataOffset)));
+	cFYI(1, "Decoding GetDFSRefer response BCC: %d Offset %d",
+		pSMBr->ByteCount,
+		le16_to_cpu(pSMBr->t2.DataOffset));
 
 	/* parse returned result into more usable form */
 	rc = parse_DFS_referrals(pSMBr, num_of_nodes,
@@ -4065,7 +4234,7 @@ SMBOldQFSInfo(const int xid, struct cifsTconInfo *tcon, struct kstatfs *FSData)
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, ("OldQFSInfo"));
+	cFYI(1, "OldQFSInfo");
 oldQFSInfoRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		(void **) &pSMBr);
@@ -4098,7 +4267,7 @@ oldQFSInfoRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 		(struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QFSInfo = %d", rc));
+		cFYI(1, "Send error in QFSInfo = %d", rc);
 	} else {                /* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4106,8 +4275,8 @@ oldQFSInfoRetry:
 			rc = -EIO;      /* bad smb */
 		else {
 			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
-			cFYI(1, ("qfsinf resp BCC: %d Offset %d",
-				 pSMBr->ByteCount, data_offset));
+			cFYI(1, "qfsinf resp BCC: %d Offset %d",
+				 pSMBr->ByteCount, data_offset);
 
 			response_data = (FILE_SYSTEM_ALLOC_INFO *)
 				(((char *) &pSMBr->hdr.Protocol) + data_offset);
@@ -4119,11 +4288,10 @@ oldQFSInfoRetry:
 				le32_to_cpu(response_data->TotalAllocationUnits);
 			FSData->f_bfree = FSData->f_bavail =
 				le32_to_cpu(response_data->FreeAllocationUnits);
-			cFYI(1,
-			     ("Blocks: %lld Free: %lld Block size %ld",
-			      (unsigned long long)FSData->f_blocks,
-			      (unsigned long long)FSData->f_bfree,
-			      FSData->f_bsize));
+			cFYI(1, "Blocks: %lld Free: %lld Block size %ld",
+			     (unsigned long long)FSData->f_blocks,
+			     (unsigned long long)FSData->f_bfree,
+			     FSData->f_bsize);
 		}
 	}
 	cifs_buf_release(pSMB);
@@ -4145,7 +4313,7 @@ CIFSSMBQFSInfo(const int xid, struct cifsTconInfo *tcon, struct kstatfs *FSData)
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In QFSInfo"));
+	cFYI(1, "In QFSInfo");
 QFSInfoRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -4178,7 +4346,7 @@ QFSInfoRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QFSInfo = %d", rc));
+		cFYI(1, "Send error in QFSInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4199,11 +4367,10 @@ QFSInfoRetry:
 			    le64_to_cpu(response_data->TotalAllocationUnits);
 			FSData->f_bfree = FSData->f_bavail =
 			    le64_to_cpu(response_data->FreeAllocationUnits);
-			cFYI(1,
-			     ("Blocks: %lld Free: %lld Block size %ld",
-			      (unsigned long long)FSData->f_blocks,
-			      (unsigned long long)FSData->f_bfree,
-			      FSData->f_bsize));
+			cFYI(1, "Blocks: %lld Free: %lld Block size %ld",
+			     (unsigned long long)FSData->f_blocks,
+			     (unsigned long long)FSData->f_bfree,
+			     FSData->f_bsize);
 		}
 	}
 	cifs_buf_release(pSMB);
@@ -4225,7 +4392,7 @@ CIFSSMBQFSAttributeInfo(const int xid, struct cifsTconInfo *tcon)
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In QFSAttributeInfo"));
+	cFYI(1, "In QFSAttributeInfo");
QFSAttributeRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -4259,7 +4426,7 @@ QFSAttributeRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cERROR(1, ("Send error in QFSAttributeInfo = %d", rc));
+		cERROR(1, "Send error in QFSAttributeInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4295,7 +4462,7 @@ CIFSSMBQFSDeviceInfo(const int xid, struct cifsTconInfo *tcon)
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In QFSDeviceInfo"));
+	cFYI(1, "In QFSDeviceInfo");
QFSDeviceRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -4330,7 +4497,7 @@ QFSDeviceRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QFSDeviceInfo = %d", rc));
+		cFYI(1, "Send error in QFSDeviceInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4365,7 +4532,7 @@ CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon)
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In QFSUnixInfo"));
+	cFYI(1, "In QFSUnixInfo");
QFSUnixRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -4399,7 +4566,7 @@ QFSUnixRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cERROR(1, ("Send error in QFSUnixInfo = %d", rc));
+		cERROR(1, "Send error in QFSUnixInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4434,7 +4601,7 @@ CIFSSMBSetFSUnixInfo(const int xid, struct cifsTconInfo *tcon, __u64 cap)
 	int bytes_returned = 0;
 	__u16 params, param_offset, offset, byte_count;
 
-	cFYI(1, ("In SETFSUnixInfo"));
+	cFYI(1, "In SETFSUnixInfo");
SETFSUnixRetry:
 	/* BB switch to small buf init to save memory */
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -4482,7 +4649,7 @@ SETFSUnixRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cERROR(1, ("Send error in SETFSUnixInfo = %d", rc));
+		cERROR(1, "Send error in SETFSUnixInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 		if (rc)
@@ -4510,7 +4677,7 @@ CIFSSMBQFSPosixInfo(const int xid, struct cifsTconInfo *tcon,
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In QFSPosixInfo"));
+	cFYI(1, "In QFSPosixInfo");
QFSPosixRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -4544,7 +4711,7 @@ QFSPosixRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QFSUnixInfo = %d", rc));
+		cFYI(1, "Send error in QFSUnixInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4604,7 +4771,7 @@ CIFSSMBSetEOF(const int xid, struct cifsTconInfo *tcon, const char *fileName,
 	int bytes_returned = 0;
 	__u16 params, byte_count, data_count, param_offset, offset;
 
-	cFYI(1, ("In SetEOF"));
+	cFYI(1, "In SetEOF");
SetEOFRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -4670,7 +4837,7 @@ SetEOFRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, ("SetPathInfo (file size) returned %d", rc));
+		cFYI(1, "SetPathInfo (file size) returned %d", rc);
 
 	cifs_buf_release(pSMB);
 
@@ -4690,8 +4857,8 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
 	int rc = 0;
 	__u16 params, param_offset, offset, byte_count, count;
 
-	cFYI(1, ("SetFileSize (via SetFileInfo) %lld",
-		(long long)size));
+	cFYI(1, "SetFileSize (via SetFileInfo) %lld",
+		(long long)size);
 	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
 
 	if (rc)
@@ -4750,9 +4917,7 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
 	if (rc) {
-		cFYI(1,
-		     ("Send error in SetFileInfo (SetFileSize) = %d",
-		      rc));
+		cFYI(1, "Send error in SetFileInfo (SetFileSize) = %d", rc);
 	}
 
 	/* Note: On -EAGAIN error only caller can retry on handle based calls
@@ -4776,7 +4941,7 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
 	int rc = 0;
 	__u16 params, param_offset, offset, byte_count, count;
 
-	cFYI(1, ("Set Times (via SetFileInfo)"));
+	cFYI(1, "Set Times (via SetFileInfo)");
 	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
 
 	if (rc)
@@ -4821,7 +4986,7 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
 	memcpy(data_offset, data, sizeof(FILE_BASIC_INFO));
 	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
 	if (rc)
-		cFYI(1, ("Send error in Set Time (SetFileInfo) = %d", rc));
+		cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc);
 
 	/* Note: On -EAGAIN error only caller can retry on handle based calls
 	   since file handle passed in no longer valid */
@@ -4838,7 +5003,7 @@ CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
 	int rc = 0;
 	__u16 params, param_offset, offset, byte_count, count;
 
-	cFYI(1, ("Set File Disposition (via SetFileInfo)"));
+	cFYI(1, "Set File Disposition (via SetFileInfo)");
 	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
 
 	if (rc)
@@ -4880,7 +5045,7 @@ CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
 	*data_offset = delete_file ? 1 : 0;
 	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
 	if (rc)
-		cFYI(1, ("Send error in SetFileDisposition = %d", rc));
+		cFYI(1, "Send error in SetFileDisposition = %d", rc);
 
 	return rc;
 }
@@ -4898,7 +5063,7 @@ CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon,
 	char *data_offset;
 	__u16 params, param_offset, offset, byte_count, count;
 
-	cFYI(1, ("In SetTimes"));
+	cFYI(1, "In SetTimes");
 
SetTimesRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -4954,7 +5119,7 @@ SetTimesRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, ("SetPathInfo (times) returned %d", rc));
+		cFYI(1, "SetPathInfo (times) returned %d", rc);
 
 	cifs_buf_release(pSMB);
 
@@ -4979,7 +5144,7 @@ CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon, char *fileName,
 	int bytes_returned;
 	int name_len;
 
-	cFYI(1, ("In SetAttrLegacy"));
+	cFYI(1, "In SetAttrLegacy");
 
SetAttrLgcyRetry:
 	rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB,
@@ -5005,7 +5170,7 @@ SetAttrLgcyRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, ("Error in LegacySetAttr = %d", rc));
+		cFYI(1, "Error in LegacySetAttr = %d", rc);
 
 	cifs_buf_release(pSMB);
 
@@ -5067,7 +5232,7 @@ CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
 	int rc = 0;
 	u16 params, param_offset, offset, byte_count, count;
 
-	cFYI(1, ("Set Unix Info (via SetFileInfo)"));
+	cFYI(1, "Set Unix Info (via SetFileInfo)");
 	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
 
 	if (rc)
@@ -5112,7 +5277,7 @@ CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
 
 	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
 	if (rc)
-		cFYI(1, ("Send error in Set Time (SetFileInfo) = %d", rc));
+		cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc);
 
 	/* Note: On -EAGAIN error only caller can retry on handle based calls
 	   since file handle passed in no longer valid */
@@ -5133,7 +5298,7 @@ CIFSSMBUnixSetPathInfo(const int xid, struct cifsTconInfo *tcon, char *fileName,
 	FILE_UNIX_BASIC_INFO *data_offset;
 	__u16 params, param_offset, offset, count, byte_count;
 
-	cFYI(1, ("In SetUID/GID/Mode"));
+	cFYI(1, "In SetUID/GID/Mode");
setPermsRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -5189,7 +5354,7 @@ setPermsRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, ("SetPathInfo (perms) returned %d", rc));
+		cFYI(1, "SetPathInfo (perms) returned %d", rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -5208,7 +5373,7 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
 	struct dir_notify_req *dnotify_req;
 	int bytes_returned;
 
-	cFYI(1, ("In CIFSSMBNotify for file handle %d", (int)netfid));
+	cFYI(1, "In CIFSSMBNotify for file handle %d", (int)netfid);
 	rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
 	if (rc)
@@ -5242,7 +5407,7 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
 			 (struct smb_hdr *)pSMBr, &bytes_returned,
 			 CIFS_ASYNC_OP);
 	if (rc) {
-		cFYI(1, ("Error in Notify = %d", rc));
+		cFYI(1, "Error in Notify = %d", rc);
 	} else {
 		/* Add file to outstanding requests */
 		/* BB change to kmem cache alloc */
@@ -5298,7 +5463,7 @@ CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
 	char *end_of_smb;
 	__u16 params, byte_count, data_offset;
 
-	cFYI(1, ("In Query All EAs path %s", searchName));
+	cFYI(1, "In Query All EAs path %s", searchName);
QAllEAsRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -5345,7 +5510,7 @@ QAllEAsRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QueryAllEAs = %d", rc));
+		cFYI(1, "Send error in QueryAllEAs = %d", rc);
 		goto QAllEAsOut;
 	}
 
@@ -5373,16 +5538,16 @@ QAllEAsRetry:
 		(((char *) &pSMBr->hdr.Protocol) + data_offset);
 
 	list_len = le32_to_cpu(ea_response_data->list_len);
-	cFYI(1, ("ea length %d", list_len));
+	cFYI(1, "ea length %d", list_len);
 	if (list_len <= 8) {
-		cFYI(1, ("empty EA list returned from server"));
+		cFYI(1, "empty EA list returned from server");
 		goto QAllEAsOut;
 	}
 
 	/* make sure list_len doesn't go past end of SMB */
 	end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr);
 	if ((char *)ea_response_data + list_len > end_of_smb) {
-		cFYI(1, ("EA list appears to go beyond SMB"));
+		cFYI(1, "EA list appears to go beyond SMB");
 		rc = -EIO;
 		goto QAllEAsOut;
 	}
@@ -5399,7 +5564,7 @@ QAllEAsRetry:
 		temp_ptr += 4;
 		/* make sure we can read name_len and value_len */
 		if (list_len < 0) {
-			cFYI(1, ("EA entry goes beyond length of list"));
+			cFYI(1, "EA entry goes beyond length of list");
 			rc = -EIO;
 			goto QAllEAsOut;
 		}
@@ -5408,7 +5573,7 @@ QAllEAsRetry:
 		value_len = le16_to_cpu(temp_fea->value_len);
 		list_len -= name_len + 1 + value_len;
 		if (list_len < 0) {
-			cFYI(1, ("EA entry goes beyond length of list"));
+			cFYI(1, "EA entry goes beyond length of list");
 			rc = -EIO;
 			goto QAllEAsOut;
 		}
@@ -5475,7 +5640,7 @@ CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon, const char *fileName,
 	int bytes_returned = 0;
 	__u16 params, param_offset, byte_count, offset, count;
 
-	cFYI(1, ("In SetEA"));
+	cFYI(1, "In SetEA");
SetEARetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -5557,7 +5722,7 @@ SetEARetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, ("SetPathInfo (EA) returned %d", rc));
+		cFYI(1, "SetPathInfo (EA) returned %d", rc);
 
 	cifs_buf_release(pSMB);
 
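
Every cifssmb.c hunk above is the same mechanical conversion: the debug macros lose the extra parentheses around their printf-style arguments, so cFYI(1, ("fmt", args)) becomes cFYI(1, "fmt", args). That is only possible if cFYI and cERROR are variadic macros. A minimal sketch of what such definitions look like (the real ones live in fs/cifs/cifs_debug.h; the bodies below illustrate the technique and are not the exact kernel code):

	/* sketch only -- not the exact fs/cifs/cifs_debug.h definitions */
	#define cFYI(set, fmt, ...)						\
	do {									\
		if (set)							\
			printk(KERN_DEBUG "CIFS FYI: " fmt "\n", ##__VA_ARGS__);\
	} while (0)

	#define cERROR(set, fmt, ...)						\
	do {									\
		if (set)							\
			printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__);	\
	} while (0)

With the old non-variadic form, the whole argument list had to be passed as a single parenthesized expression, which is why every call site carried the doubled parentheses being removed here.
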
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 45eb6cba793f..2208f06e4c45 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -23,6 +23,7 @@
 #include <linux/string.h>
 #include <linux/list.h>
 #include <linux/wait.h>
+#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/ctype.h>
 #include <linux/utsname.h>
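
The new include of linux/slab.h is needed because connect.c calls the slab allocators directly and, with the header cleanups going into this merge window, can no longer rely on picking the declarations up indirectly through other headers. Representative call sites appear further down in this same file, for example:

	ses->password = kstrdup(volume_info->password, GFP_KERNEL);
	ses->domainName = kmalloc(len + 1, GFP_KERNEL);

Both kstrdup() and kmalloc() are declared in linux/slab.h.
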
@@ -101,6 +102,7 @@ struct smb_vol {
 	bool sockopt_tcp_nodelay:1;
 	unsigned short int port;
 	char *prepath;
+	struct nls_table *local_nls;
 };
 
 static int ipv4_connect(struct TCP_Server_Info *server);
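
struct smb_vol gains a local_nls pointer so that the parsed mount options carry the caller's charset table along to the connection helpers. A sketch of the intended flow (the assignment site is outside this excerpt and is an assumption; the consuming call appears verbatim in cifs_get_smb_ses() below):

	/* assumed: the mount path stashes the nls table in the parsed options */
	volume_info->local_nls = cifs_sb->local_nls;

	/* ...so session setup can reach it without an extra parameter */
	rc = cifs_setup_session(xid, ses, volume_info->local_nls);
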
@@ -134,7 +136,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	spin_unlock(&GlobalMid_Lock);
 	server->maxBuf = 0;
 
-	cFYI(1, ("Reconnecting tcp session"));
+	cFYI(1, "Reconnecting tcp session");
 
 	/* before reconnecting the tcp session, mark the smb session (uid)
 	   and the tid bad so they are not used until reconnected */
@@ -152,12 +154,12 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	/* do not want to be sending data on a socket we are freeing */
 	mutex_lock(&server->srv_mutex);
 	if (server->ssocket) {
-		cFYI(1, ("State: 0x%x Flags: 0x%lx", server->ssocket->state,
-			server->ssocket->flags));
+		cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state,
+			server->ssocket->flags);
 		kernel_sock_shutdown(server->ssocket, SHUT_WR);
-		cFYI(1, ("Post shutdown state: 0x%x Flags: 0x%lx",
-			server->ssocket->state,
-			server->ssocket->flags));
+		cFYI(1, "Post shutdown state: 0x%x Flags: 0x%lx",
+			server->ssocket->state,
+			server->ssocket->flags);
 		sock_release(server->ssocket);
 		server->ssocket = NULL;
 	}
@@ -186,7 +188,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	else
 		rc = ipv4_connect(server);
 	if (rc) {
-		cFYI(1, ("reconnect error %d", rc));
+		cFYI(1, "reconnect error %d", rc);
 		msleep(3000);
 	} else {
 		atomic_inc(&tcpSesReconnectCount);
@@ -222,7 +224,7 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
 	/* check for plausible wct, bcc and t2 data and parm sizes */
 	/* check for parm and data offset going beyond end of smb */
 	if (pSMB->WordCount != 10) { /* coalesce_t2 depends on this */
-		cFYI(1, ("invalid transact2 word count"));
+		cFYI(1, "invalid transact2 word count");
 		return -EINVAL;
 	}
 
@@ -236,15 +238,15 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
 	if (remaining == 0)
 		return 0;
 	else if (remaining < 0) {
-		cFYI(1, ("total data %d smaller than data in frame %d",
-			total_data_size, data_in_this_rsp));
+		cFYI(1, "total data %d smaller than data in frame %d",
+			total_data_size, data_in_this_rsp);
 		return -EINVAL;
 	} else {
-		cFYI(1, ("missing %d bytes from transact2, check next response",
-			remaining));
+		cFYI(1, "missing %d bytes from transact2, check next response",
+			remaining);
 		if (total_data_size > maxBufSize) {
-			cERROR(1, ("TotalDataSize %d is over maximum buffer %d",
-				total_data_size, maxBufSize));
+			cERROR(1, "TotalDataSize %d is over maximum buffer %d",
+				total_data_size, maxBufSize);
 			return -EINVAL;
 		}
 		return remaining;
@@ -266,7 +268,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
 	total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount);
 
 	if (total_data_size != le16_to_cpu(pSMB2->t2_rsp.TotalDataCount)) {
-		cFYI(1, ("total data size of primary and secondary t2 differ"));
+		cFYI(1, "total data size of primary and secondary t2 differ");
 	}
 
 	total_in_buf = le16_to_cpu(pSMBt->t2_rsp.DataCount);
@@ -281,7 +283,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
 
 	total_in_buf2 = le16_to_cpu(pSMB2->t2_rsp.DataCount);
 	if (remaining < total_in_buf2) {
-		cFYI(1, ("transact2 2nd response contains too much data"));
+		cFYI(1, "transact2 2nd response contains too much data");
 	}
 
 	/* find end of first SMB data area */
@@ -310,7 +312,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
 	pTargetSMB->smb_buf_length = byte_count;
 
 	if (remaining == total_in_buf2) {
-		cFYI(1, ("found the last secondary response"));
+		cFYI(1, "found the last secondary response");
 		return 0; /* we are done */
 	} else /* more responses to go */
 		return 1;
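
Taken together, check2ndT2() and coalesce_t2() implement reassembly of multi-part transact2 responses: check2ndT2() validates a follow-on frame and reports how many bytes are still outstanding, while coalesce_t2() splices the secondary frame's data into the first buffer, returning 0 once the response is complete and 1 while more frames are expected. A simplified sketch of how the demultiplex loop consumes that return value (deliver_response() is a hypothetical stand-in for waking the waiting mid; the real logic sits on the multi_t2_fnd path below):

	ret = coalesce_t2(smb_buffer, (struct smb_hdr *)bigbuf);
	if (ret < 0)
		cifs_reconnect(server);	/* malformed multi-part response */
	else if (ret == 0)
		deliver_response();	/* hypothetical: full T2 assembled */
	else
		isMultiRsp = true;	/* wait for the next secondary frame */
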
@@ -338,7 +340,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 	int reconnect;
 
 	current->flags |= PF_MEMALLOC;
-	cFYI(1, ("Demultiplex PID: %d", task_pid_nr(current)));
+	cFYI(1, "Demultiplex PID: %d", task_pid_nr(current));
 
 	length = atomic_inc_return(&tcpSesAllocCount);
 	if (length > 1)
@@ -352,7 +354,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 		if (bigbuf == NULL) {
 			bigbuf = cifs_buf_get();
 			if (!bigbuf) {
-				cERROR(1, ("No memory for large SMB response"));
+				cERROR(1, "No memory for large SMB response");
 				msleep(3000);
 				/* retry will check if exiting */
 				continue;
@@ -365,7 +367,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 		if (smallbuf == NULL) {
 			smallbuf = cifs_small_buf_get();
 			if (!smallbuf) {
-				cERROR(1, ("No memory for SMB response"));
+				cERROR(1, "No memory for SMB response");
 				msleep(1000);
 				/* retry will check if exiting */
 				continue;
@@ -390,9 +392,9 @@ incomplete_rcv:
 		if (server->tcpStatus == CifsExiting) {
 			break;
 		} else if (server->tcpStatus == CifsNeedReconnect) {
-			cFYI(1, ("Reconnect after server stopped responding"));
+			cFYI(1, "Reconnect after server stopped responding");
 			cifs_reconnect(server);
-			cFYI(1, ("call to reconnect done"));
+			cFYI(1, "call to reconnect done");
 			csocket = server->ssocket;
 			continue;
 		} else if ((length == -ERESTARTSYS) || (length == -EAGAIN)) {
@@ -410,7 +412,7 @@ incomplete_rcv:
 			continue;
 		} else if (length <= 0) {
 			if (server->tcpStatus == CifsNew) {
-				cFYI(1, ("tcp session abend after SMBnegprot"));
+				cFYI(1, "tcp session abend after SMBnegprot");
 				/* some servers kill the TCP session rather than
 				   returning an SMB negprot error, in which
 				   case reconnecting here is not going to help,
@@ -418,18 +420,18 @@ incomplete_rcv:
 				break;
 			}
 			if (!try_to_freeze() && (length == -EINTR)) {
-				cFYI(1, ("cifsd thread killed"));
+				cFYI(1, "cifsd thread killed");
 				break;
 			}
-			cFYI(1, ("Reconnect after unexpected peek error %d",
-				length));
+			cFYI(1, "Reconnect after unexpected peek error %d",
+				length);
 			cifs_reconnect(server);
 			csocket = server->ssocket;
 			wake_up(&server->response_q);
 			continue;
 		} else if (length < pdu_length) {
-			cFYI(1, ("requested %d bytes but only got %d bytes",
-				pdu_length, length));
+			cFYI(1, "requested %d bytes but only got %d bytes",
+				pdu_length, length);
 			pdu_length -= length;
 			msleep(1);
 			goto incomplete_rcv;
@@ -449,18 +451,18 @@ incomplete_rcv:
 		pdu_length = be32_to_cpu((__force __be32)smb_buffer->smb_buf_length);
 		smb_buffer->smb_buf_length = pdu_length;
 
-		cFYI(1, ("rfc1002 length 0x%x", pdu_length+4));
+		cFYI(1, "rfc1002 length 0x%x", pdu_length+4);
 
 		if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) {
 			continue;
 		} else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) {
-			cFYI(1, ("Good RFC 1002 session rsp"));
+			cFYI(1, "Good RFC 1002 session rsp");
 			continue;
 		} else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) {
 			/* we get this from Windows 98 instead of
 			   an error on SMB negprot response */
-			cFYI(1, ("Negative RFC1002 Session Response Error 0x%x)",
-				pdu_length));
+			cFYI(1, "Negative RFC1002 Session Response Error 0x%x)",
+				pdu_length);
 			if (server->tcpStatus == CifsNew) {
 				/* if nack on negprot (rather than
 				   ret of smb negprot error) reconnecting
@@ -483,7 +485,7 @@ incomplete_rcv:
 				continue;
 			}
 		} else if (temp != (char) 0) {
-			cERROR(1, ("Unknown RFC 1002 frame"));
+			cERROR(1, "Unknown RFC 1002 frame");
 			cifs_dump_mem(" Received Data: ", (char *)smb_buffer,
 				      length);
 			cifs_reconnect(server);
@@ -494,8 +496,8 @@ incomplete_rcv:
 		/* else we have an SMB response */
 		if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) ||
 			    (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) {
-			cERROR(1, ("Invalid size SMB length %d pdu_length %d",
-					length, pdu_length+4));
+			cERROR(1, "Invalid size SMB length %d pdu_length %d",
+					length, pdu_length+4);
 			cifs_reconnect(server);
 			csocket = server->ssocket;
 			wake_up(&server->response_q);
@@ -538,8 +540,8 @@ incomplete_rcv:
 				length = 0;
 				continue;
 			} else if (length <= 0) {
-				cERROR(1, ("Received no data, expecting %d",
-					pdu_length - total_read));
+				cERROR(1, "Received no data, expecting %d",
+					pdu_length - total_read);
 				cifs_reconnect(server);
 				csocket = server->ssocket;
 				reconnect = 1;
@@ -587,7 +589,7 @@ incomplete_rcv:
 			}
 		} else {
 			if (!isLargeBuf) {
-				cERROR(1,("1st trans2 resp needs bigbuf"));
+				cERROR(1, "1st trans2 resp needs bigbuf");
 				/* BB maybe we can fix this up, switch
 				   to already allocated large buffer? */
 			} else {
@@ -629,8 +631,8 @@ multi_t2_fnd:
 				wake_up_process(task_to_wake);
 		} else if (!is_valid_oplock_break(smb_buffer, server) &&
 			   !isMultiRsp) {
-			cERROR(1, ("No task to wake, unknown frame received! "
-				   "NumMids %d", midCount.counter));
+			cERROR(1, "No task to wake, unknown frame received! "
+				   "NumMids %d", midCount.counter);
 			cifs_dump_mem("Received Data is: ", (char *)smb_buffer,
 				      sizeof(struct smb_hdr));
 #ifdef CONFIG_CIFS_DEBUG2
@@ -707,8 +709,8 @@ multi_t2_fnd:
 		list_for_each(tmp, &server->pending_mid_q) {
 			mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
 			if (mid_entry->midState == MID_REQUEST_SUBMITTED) {
-				cFYI(1, ("Clearing Mid 0x%x - waking up ",
-					 mid_entry->mid));
+				cFYI(1, "Clearing Mid 0x%x - waking up ",
+					 mid_entry->mid);
 				task_to_wake = mid_entry->tsk;
 				if (task_to_wake)
 					wake_up_process(task_to_wake);
@@ -727,7 +729,7 @@ multi_t2_fnd:
 		   to wait at least 45 seconds before giving up
 		   on a request getting a response and going ahead
 		   and killing cifsd */
-		cFYI(1, ("Wait for exit from demultiplex thread"));
+		cFYI(1, "Wait for exit from demultiplex thread");
 		msleep(46000);
 		/* if threads still have not exited they are probably never
 		   coming home not much else we can do but free the memory */
@@ -848,7 +850,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 			separator[0] = options[4];
 			options += 5;
 		} else {
-			cFYI(1, ("Null separator not allowed"));
+			cFYI(1, "Null separator not allowed");
 		}
 	}
 
@@ -973,7 +975,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 			}
 		} else if (strnicmp(data, "sec", 3) == 0) {
 			if (!value || !*value) {
-				cERROR(1, ("no security value specified"));
+				cERROR(1, "no security value specified");
 				continue;
 			} else if (strnicmp(value, "krb5i", 5) == 0) {
 				vol->secFlg |= CIFSSEC_MAY_KRB5 |
@@ -981,7 +983,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 			} else if (strnicmp(value, "krb5p", 5) == 0) {
 				/* vol->secFlg |= CIFSSEC_MUST_SEAL |
 				   CIFSSEC_MAY_KRB5; */
-				cERROR(1, ("Krb5 cifs privacy not supported"));
+				cERROR(1, "Krb5 cifs privacy not supported");
 				return 1;
 			} else if (strnicmp(value, "krb5", 4) == 0) {
 				vol->secFlg |= CIFSSEC_MAY_KRB5;
@@ -1013,7 +1015,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 			} else if (strnicmp(value, "none", 4) == 0) {
 				vol->nullauth = 1;
 			} else {
-				cERROR(1, ("bad security option: %s", value));
+				cERROR(1, "bad security option: %s", value);
 				return 1;
 			}
 		} else if ((strnicmp(data, "unc", 3) == 0)
@@ -1052,7 +1054,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 			   a domain name and need special handling? */
 			if (strnlen(value, 256) < 256) {
 				vol->domainname = value;
-				cFYI(1, ("Domain name set"));
+				cFYI(1, "Domain name set");
 			} else {
 				printk(KERN_WARNING "CIFS: domain name too "
 						    "long\n");
@@ -1075,7 +1077,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 					strcpy(vol->prepath+1, value);
 				} else
 					strcpy(vol->prepath, value);
-				cFYI(1, ("prefix path %s", vol->prepath));
+				cFYI(1, "prefix path %s", vol->prepath);
 			} else {
 				printk(KERN_WARNING "CIFS: prefix too long\n");
 				return 1;
@@ -1091,7 +1093,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 				vol->iocharset = value;
 				/* if iocharset not set then load_nls_default
 				   is used by caller */
-				cFYI(1, ("iocharset set to %s", value));
+				cFYI(1, "iocharset set to %s", value);
 			} else {
 				printk(KERN_WARNING "CIFS: iocharset name "
 						    "too long.\n");
@@ -1143,14 +1145,14 @@ cifs_parse_mount_options(char *options, const char *devname,
 			}
 		} else if (strnicmp(data, "sockopt", 5) == 0) {
 			if (!value || !*value) {
-				cERROR(1, ("no socket option specified"));
+				cERROR(1, "no socket option specified");
 				continue;
 			} else if (strnicmp(value, "TCP_NODELAY", 11) == 0) {
 				vol->sockopt_tcp_nodelay = 1;
 			}
 		} else if (strnicmp(data, "netbiosname", 4) == 0) {
 			if (!value || !*value || (*value == ' ')) {
-				cFYI(1, ("invalid (empty) netbiosname"));
+				cFYI(1, "invalid (empty) netbiosname");
 			} else {
 				memset(vol->source_rfc1001_name, 0x20, 15);
 				for (i = 0; i < 15; i++) {
@@ -1174,7 +1176,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 		} else if (strnicmp(data, "servern", 7) == 0) {
 			/* servernetbiosname specified override *SMBSERVER */
 			if (!value || !*value || (*value == ' ')) {
-				cFYI(1, ("empty server netbiosname specified"));
+				cFYI(1, "empty server netbiosname specified");
 			} else {
 				/* last byte, type, is 0x20 for servr type */
 				memset(vol->target_rfc1001_name, 0x20, 16);
@@ -1433,7 +1435,7 @@ cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port)
 
 		++server->srv_count;
 		write_unlock(&cifs_tcp_ses_lock);
-		cFYI(1, ("Existing tcp session with server found"));
+		cFYI(1, "Existing tcp session with server found");
 		return server;
 	}
 	write_unlock(&cifs_tcp_ses_lock);
@@ -1474,7 +1476,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 
 	memset(&addr, 0, sizeof(struct sockaddr_storage));
 
-	cFYI(1, ("UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip));
+	cFYI(1, "UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip);
 
 	if (volume_info->UNCip && volume_info->UNC) {
 		rc = cifs_convert_address(volume_info->UNCip, &addr);
@@ -1486,13 +1488,12 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 	} else if (volume_info->UNCip) {
 		/* BB using ip addr as tcp_ses name to connect to the
 		   DFS root below */
-		cERROR(1, ("Connecting to DFS root not implemented yet"));
+		cERROR(1, "Connecting to DFS root not implemented yet");
 		rc = -EINVAL;
 		goto out_err;
 	} else /* which tcp_sess DFS root would we conect to */ {
-		cERROR(1,
-		       ("CIFS mount error: No UNC path (e.g. -o "
-			"unc=//192.168.1.100/public) specified"));
+		cERROR(1, "CIFS mount error: No UNC path (e.g. -o "
+			"unc=//192.168.1.100/public) specified");
 		rc = -EINVAL;
 		goto out_err;
 	}
@@ -1539,7 +1540,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 	++tcp_ses->srv_count;
 
 	if (addr.ss_family == AF_INET6) {
-		cFYI(1, ("attempting ipv6 connect"));
+		cFYI(1, "attempting ipv6 connect");
 		/* BB should we allow ipv6 on port 139? */
 		/* other OS never observed in Wild doing 139 with v6 */
 		sin_server6->sin6_port = htons(volume_info->port);
@@ -1553,7 +1554,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 		rc = ipv4_connect(tcp_ses);
 	}
 	if (rc < 0) {
-		cERROR(1, ("Error connecting to socket. Aborting operation"));
+		cERROR(1, "Error connecting to socket. Aborting operation");
 		goto out_err;
 	}
 
@@ -1566,7 +1567,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 				  tcp_ses, "cifsd");
 	if (IS_ERR(tcp_ses->tsk)) {
 		rc = PTR_ERR(tcp_ses->tsk);
-		cERROR(1, ("error %d create cifsd thread", rc));
+		cERROR(1, "error %d create cifsd thread", rc);
 		module_put(THIS_MODULE);
 		goto out_err;
 	}
@@ -1615,6 +1616,7 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
 	int xid;
 	struct TCP_Server_Info *server = ses->server;
 
+	cFYI(1, "%s: ses_count=%d\n", __func__, ses->ses_count);
 	write_lock(&cifs_tcp_ses_lock);
 	if (--ses->ses_count > 0) {
 		write_unlock(&cifs_tcp_ses_lock);
@@ -1633,6 +1635,102 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
1633 cifs_put_tcp_session(server); 1635 cifs_put_tcp_session(server);
1634} 1636}
1635 1637
1638static struct cifsSesInfo *
1639cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1640{
1641 int rc = -ENOMEM, xid;
1642 struct cifsSesInfo *ses;
1643
1644 xid = GetXid();
1645
1646 ses = cifs_find_smb_ses(server, volume_info->username);
1647 if (ses) {
1648 cFYI(1, "Existing smb sess found (status=%d)", ses->status);
1649
1650 /* existing SMB ses has a server reference already */
1651 cifs_put_tcp_session(server);
1652
1653 mutex_lock(&ses->session_mutex);
1654 rc = cifs_negotiate_protocol(xid, ses);
1655 if (rc) {
1656 mutex_unlock(&ses->session_mutex);
1657 /* problem -- put our ses reference */
1658 cifs_put_smb_ses(ses);
1659 FreeXid(xid);
1660 return ERR_PTR(rc);
1661 }
1662 if (ses->need_reconnect) {
1663 cFYI(1, "Session needs reconnect");
1664 rc = cifs_setup_session(xid, ses,
1665 volume_info->local_nls);
1666 if (rc) {
1667 mutex_unlock(&ses->session_mutex);
1668 /* problem -- put our reference */
1669 cifs_put_smb_ses(ses);
1670 FreeXid(xid);
1671 return ERR_PTR(rc);
1672 }
1673 }
1674 mutex_unlock(&ses->session_mutex);
1675 FreeXid(xid);
1676 return ses;
1677 }
1678
1679 cFYI(1, "Existing smb sess not found");
1680 ses = sesInfoAlloc();
1681 if (ses == NULL)
1682 goto get_ses_fail;
1683
1684 /* new SMB session uses our server ref */
1685 ses->server = server;
1686 if (server->addr.sockAddr6.sin6_family == AF_INET6)
1687 sprintf(ses->serverName, "%pI6",
1688 &server->addr.sockAddr6.sin6_addr);
1689 else
1690 sprintf(ses->serverName, "%pI4",
1691 &server->addr.sockAddr.sin_addr.s_addr);
1692
1693 if (volume_info->username)
1694 strncpy(ses->userName, volume_info->username,
1695 MAX_USERNAME_SIZE);
1696
1697 /* volume_info->password freed at unmount */
1698 if (volume_info->password) {
1699 ses->password = kstrdup(volume_info->password, GFP_KERNEL);
1700 if (!ses->password)
1701 goto get_ses_fail;
1702 }
1703 if (volume_info->domainname) {
1704 int len = strlen(volume_info->domainname);
1705 ses->domainName = kmalloc(len + 1, GFP_KERNEL);
1706 if (ses->domainName)
1707 strcpy(ses->domainName, volume_info->domainname);
1708 }
1709 ses->linux_uid = volume_info->linux_uid;
1710 ses->overrideSecFlg = volume_info->secFlg;
1711
1712 mutex_lock(&ses->session_mutex);
1713 rc = cifs_negotiate_protocol(xid, ses);
1714 if (!rc)
1715 rc = cifs_setup_session(xid, ses, volume_info->local_nls);
1716 mutex_unlock(&ses->session_mutex);
1717 if (rc)
1718 goto get_ses_fail;
1719
1720 /* success, put it on the list */
1721 write_lock(&cifs_tcp_ses_lock);
1722 list_add(&ses->smb_ses_list, &server->smb_ses_list);
1723 write_unlock(&cifs_tcp_ses_lock);
1724
1725 FreeXid(xid);
1726 return ses;
1727
1728get_ses_fail:
1729 sesInfoFree(ses);
1730 FreeXid(xid);
1731 return ERR_PTR(rc);
1732}
1733
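The cifs_get_smb_ses() helper added above pulls the find-or-create session logic out of cifs_mount(). Its reference-counting contract is worth spelling out: the caller hands in a TCP server reference, and when an existing session is matched, that extra reference is dropped because the session already owns one. A hedged caller-side sketch of the resulting discipline (names as in this patch, error handling inside the helper):

struct cifsSesInfo *ses;

ses = cifs_get_smb_ses(server, volume_info);	/* consumes or drops
						   the server ref */
if (IS_ERR(ses))
	return PTR_ERR(ses);

/* ... use the session ... */

cifs_put_smb_ses(ses);	/* final put also puts the TCP server */
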
1636static struct cifsTconInfo * 1734static struct cifsTconInfo *
1637cifs_find_tcon(struct cifsSesInfo *ses, const char *unc) 1735cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
1638{ 1736{
@@ -1661,6 +1759,7 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
1661 int xid; 1759 int xid;
1662 struct cifsSesInfo *ses = tcon->ses; 1760 struct cifsSesInfo *ses = tcon->ses;
1663 1761
1762 cFYI(1, "%s: tc_count=%d\n", __func__, tcon->tc_count);
1664 write_lock(&cifs_tcp_ses_lock); 1763 write_lock(&cifs_tcp_ses_lock);
1665 if (--tcon->tc_count > 0) { 1764 if (--tcon->tc_count > 0) {
1666 write_unlock(&cifs_tcp_ses_lock); 1765 write_unlock(&cifs_tcp_ses_lock);
@@ -1678,6 +1777,80 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
1678 cifs_put_smb_ses(ses); 1777 cifs_put_smb_ses(ses);
1679} 1778}
1680 1779
1780static struct cifsTconInfo *
1781cifs_get_tcon(struct cifsSesInfo *ses, struct smb_vol *volume_info)
1782{
1783 int rc, xid;
1784 struct cifsTconInfo *tcon;
1785
1786 tcon = cifs_find_tcon(ses, volume_info->UNC);
1787 if (tcon) {
1788 cFYI(1, "Found match on UNC path");
1789 /* existing tcon already has a reference */
1790 cifs_put_smb_ses(ses);
1791 if (tcon->seal != volume_info->seal)
1792 cERROR(1, "transport encryption setting "
1793 "conflicts with existing tid");
1794 return tcon;
1795 }
1796
1797 tcon = tconInfoAlloc();
1798 if (tcon == NULL) {
1799 rc = -ENOMEM;
1800 goto out_fail;
1801 }
1802
1803 tcon->ses = ses;
1804 if (volume_info->password) {
1805 tcon->password = kstrdup(volume_info->password, GFP_KERNEL);
1806 if (!tcon->password) {
1807 rc = -ENOMEM;
1808 goto out_fail;
1809 }
1810 }
1811
1812 if (strchr(volume_info->UNC + 3, '\\') == NULL
1813 && strchr(volume_info->UNC + 3, '/') == NULL) {
1814 cERROR(1, "Missing share name");
1815 rc = -ENODEV;
1816 goto out_fail;
1817 }
1818
1819 /* BB Do we need to wrap session_mutex around
1820 * this TCon call and Unix SetFS as
1821 * we do on SessSetup and reconnect? */
1822 xid = GetXid();
1823 rc = CIFSTCon(xid, ses, volume_info->UNC, tcon, volume_info->local_nls);
1824 FreeXid(xid);
1825 cFYI(1, "CIFS Tcon rc = %d", rc);
1826 if (rc)
1827 goto out_fail;
1828
1829 if (volume_info->nodfs) {
1830 tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
1831 cFYI(1, "DFS disabled (%d)", tcon->Flags);
1832 }
1833 tcon->seal = volume_info->seal;
1834 /* we can have only one retry value for a connection
1835 to a share so for resources mounted more than once
1836 to the same server share the last value passed in
1837 for the retry flag is used */
1838 tcon->retry = volume_info->retry;
1839 tcon->nocase = volume_info->nocase;
1840 tcon->local_lease = volume_info->local_lease;
1841
1842 write_lock(&cifs_tcp_ses_lock);
1843 list_add(&tcon->tcon_list, &ses->tcon_list);
1844 write_unlock(&cifs_tcp_ses_lock);
1845
1846 return tcon;
1847
1848out_fail:
1849 tconInfoFree(tcon);
1850 return ERR_PTR(rc);
1851}
1852
1853
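cifs_get_tcon() follows the same pattern one level up, and its ownership rules — as read from the hunk above, they are not separately documented — are asymmetric: on a UNC match the extra session reference is dropped and the existing tcon returned; on failure the half-built tcon is freed but the session reference stays with the caller, which still needs it for its own error path. A sketch:

tcon = cifs_get_tcon(ses, volume_info);
if (IS_ERR(tcon)) {
	rc = PTR_ERR(tcon);
	tcon = NULL;		/* ses is still ours to put */
	goto remote_path_check;
}

Note also the behavioural wrinkle carried over from the old inline code: a conflicting "seal" option against an already-mounted share only logs an error, since the existing tid's encryption setting cannot be changed.
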
1681int 1854int
1682get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path, 1855get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
1683 const struct nls_table *nls_codepage, unsigned int *pnum_referrals, 1856 const struct nls_table *nls_codepage, unsigned int *pnum_referrals,
@@ -1702,8 +1875,7 @@ get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
1702 strcpy(temp_unc + 2, pSesInfo->serverName); 1875 strcpy(temp_unc + 2, pSesInfo->serverName);
1703 strcpy(temp_unc + 2 + strlen(pSesInfo->serverName), "\\IPC$"); 1876 strcpy(temp_unc + 2 + strlen(pSesInfo->serverName), "\\IPC$");
1704 rc = CIFSTCon(xid, pSesInfo, temp_unc, NULL, nls_codepage); 1877 rc = CIFSTCon(xid, pSesInfo, temp_unc, NULL, nls_codepage);
1705 cFYI(1, 1878 cFYI(1, "CIFS Tcon rc = %d ipc_tid = %d", rc, pSesInfo->ipc_tid);
1706 ("CIFS Tcon rc = %d ipc_tid = %d", rc, pSesInfo->ipc_tid));
1707 kfree(temp_unc); 1879 kfree(temp_unc);
1708 } 1880 }
1709 if (rc == 0) 1881 if (rc == 0)
@@ -1776,12 +1948,12 @@ ipv4_connect(struct TCP_Server_Info *server)
1776 rc = sock_create_kern(PF_INET, SOCK_STREAM, 1948 rc = sock_create_kern(PF_INET, SOCK_STREAM,
1777 IPPROTO_TCP, &socket); 1949 IPPROTO_TCP, &socket);
1778 if (rc < 0) { 1950 if (rc < 0) {
1779 cERROR(1, ("Error %d creating socket", rc)); 1951 cERROR(1, "Error %d creating socket", rc);
1780 return rc; 1952 return rc;
1781 } 1953 }
1782 1954
1783 /* BB other socket options to set KEEPALIVE, NODELAY? */ 1955 /* BB other socket options to set KEEPALIVE, NODELAY? */
1784 cFYI(1, ("Socket created")); 1956 cFYI(1, "Socket created");
1785 server->ssocket = socket; 1957 server->ssocket = socket;
1786 socket->sk->sk_allocation = GFP_NOFS; 1958 socket->sk->sk_allocation = GFP_NOFS;
1787 cifs_reclassify_socket4(socket); 1959 cifs_reclassify_socket4(socket);
@@ -1826,7 +1998,7 @@ ipv4_connect(struct TCP_Server_Info *server)
1826 if (!connected) { 1998 if (!connected) {
1827 if (orig_port) 1999 if (orig_port)
1828 server->addr.sockAddr.sin_port = orig_port; 2000 server->addr.sockAddr.sin_port = orig_port;
1829 cFYI(1, ("Error %d connecting to server via ipv4", rc)); 2001 cFYI(1, "Error %d connecting to server via ipv4", rc);
1830 sock_release(socket); 2002 sock_release(socket);
1831 server->ssocket = NULL; 2003 server->ssocket = NULL;
1832 return rc; 2004 return rc;
@@ -1854,12 +2026,12 @@ ipv4_connect(struct TCP_Server_Info *server)
1854 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY, 2026 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
1855 (char *)&val, sizeof(val)); 2027 (char *)&val, sizeof(val));
1856 if (rc) 2028 if (rc)
1857 cFYI(1, ("set TCP_NODELAY socket option error %d", rc)); 2029 cFYI(1, "set TCP_NODELAY socket option error %d", rc);
1858 } 2030 }
1859 2031
1860 cFYI(1, ("sndbuf %d rcvbuf %d rcvtimeo 0x%lx", 2032 cFYI(1, "sndbuf %d rcvbuf %d rcvtimeo 0x%lx",
1861 socket->sk->sk_sndbuf, 2033 socket->sk->sk_sndbuf,
1862 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo)); 2034 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo);
1863 2035
1864 /* send RFC1001 sessinit */ 2036 /* send RFC1001 sessinit */
1865 if (server->addr.sockAddr.sin_port == htons(RFC1001_PORT)) { 2037 if (server->addr.sockAddr.sin_port == htons(RFC1001_PORT)) {
@@ -1937,13 +2109,13 @@ ipv6_connect(struct TCP_Server_Info *server)
1937 rc = sock_create_kern(PF_INET6, SOCK_STREAM, 2109 rc = sock_create_kern(PF_INET6, SOCK_STREAM,
1938 IPPROTO_TCP, &socket); 2110 IPPROTO_TCP, &socket);
1939 if (rc < 0) { 2111 if (rc < 0) {
1940 cERROR(1, ("Error %d creating ipv6 socket", rc)); 2112 cERROR(1, "Error %d creating ipv6 socket", rc);
1941 socket = NULL; 2113 socket = NULL;
1942 return rc; 2114 return rc;
1943 } 2115 }
1944 2116
1945 /* BB other socket options to set KEEPALIVE, NODELAY? */ 2117 /* BB other socket options to set KEEPALIVE, NODELAY? */
1946 cFYI(1, ("ipv6 Socket created")); 2118 cFYI(1, "ipv6 Socket created");
1947 server->ssocket = socket; 2119 server->ssocket = socket;
1948 socket->sk->sk_allocation = GFP_NOFS; 2120 socket->sk->sk_allocation = GFP_NOFS;
1949 cifs_reclassify_socket6(socket); 2121 cifs_reclassify_socket6(socket);
@@ -1987,7 +2159,7 @@ ipv6_connect(struct TCP_Server_Info *server)
1987 if (!connected) { 2159 if (!connected) {
1988 if (orig_port) 2160 if (orig_port)
1989 server->addr.sockAddr6.sin6_port = orig_port; 2161 server->addr.sockAddr6.sin6_port = orig_port;
1990 cFYI(1, ("Error %d connecting to server via ipv6", rc)); 2162 cFYI(1, "Error %d connecting to server via ipv6", rc);
1991 sock_release(socket); 2163 sock_release(socket);
1992 server->ssocket = NULL; 2164 server->ssocket = NULL;
1993 return rc; 2165 return rc;
@@ -2006,7 +2178,7 @@ ipv6_connect(struct TCP_Server_Info *server)
2006 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY, 2178 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
2007 (char *)&val, sizeof(val)); 2179 (char *)&val, sizeof(val));
2008 if (rc) 2180 if (rc)
2009 cFYI(1, ("set TCP_NODELAY socket option error %d", rc)); 2181 cFYI(1, "set TCP_NODELAY socket option error %d", rc);
2010 } 2182 }
2011 2183
2012 server->ssocket = socket; 2184 server->ssocket = socket;
@@ -2031,13 +2203,13 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
2031 if (vol_info && vol_info->no_linux_ext) { 2203 if (vol_info && vol_info->no_linux_ext) {
2032 tcon->fsUnixInfo.Capability = 0; 2204 tcon->fsUnixInfo.Capability = 0;
2033 tcon->unix_ext = 0; /* Unix Extensions disabled */ 2205 tcon->unix_ext = 0; /* Unix Extensions disabled */
2034 cFYI(1, ("Linux protocol extensions disabled")); 2206 cFYI(1, "Linux protocol extensions disabled");
2035 return; 2207 return;
2036 } else if (vol_info) 2208 } else if (vol_info)
2037 tcon->unix_ext = 1; /* Unix Extensions supported */ 2209 tcon->unix_ext = 1; /* Unix Extensions supported */
2038 2210
2039 if (tcon->unix_ext == 0) { 2211 if (tcon->unix_ext == 0) {
2040 cFYI(1, ("Unix extensions disabled so not set on reconnect")); 2212 cFYI(1, "Unix extensions disabled so not set on reconnect");
2041 return; 2213 return;
2042 } 2214 }
2043 2215
@@ -2053,12 +2225,11 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
2053 cap &= ~CIFS_UNIX_POSIX_ACL_CAP; 2225 cap &= ~CIFS_UNIX_POSIX_ACL_CAP;
2054 if ((saved_cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) { 2226 if ((saved_cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) {
2055 if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) 2227 if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP)
2056 cERROR(1, ("POSIXPATH support change")); 2228 cERROR(1, "POSIXPATH support change");
2057 cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP; 2229 cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP;
2058 } else if ((cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) { 2230 } else if ((cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) {
2059 cERROR(1, ("possible reconnect error")); 2231 cERROR(1, "possible reconnect error");
2060 cERROR(1, 2232 cERROR(1, "server disabled POSIX path support");
2061 ("server disabled POSIX path support"));
2062 } 2233 }
2063 } 2234 }
2064 2235
@@ -2066,7 +2237,7 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
2066 if (vol_info && vol_info->no_psx_acl) 2237 if (vol_info && vol_info->no_psx_acl)
2067 cap &= ~CIFS_UNIX_POSIX_ACL_CAP; 2238 cap &= ~CIFS_UNIX_POSIX_ACL_CAP;
2068 else if (CIFS_UNIX_POSIX_ACL_CAP & cap) { 2239 else if (CIFS_UNIX_POSIX_ACL_CAP & cap) {
2069 cFYI(1, ("negotiated posix acl support")); 2240 cFYI(1, "negotiated posix acl support");
2070 if (sb) 2241 if (sb)
2071 sb->s_flags |= MS_POSIXACL; 2242 sb->s_flags |= MS_POSIXACL;
2072 } 2243 }
@@ -2074,7 +2245,7 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
2074 if (vol_info && vol_info->posix_paths == 0) 2245 if (vol_info && vol_info->posix_paths == 0)
2075 cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP; 2246 cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP;
2076 else if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) { 2247 else if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) {
2077 cFYI(1, ("negotiate posix pathnames")); 2248 cFYI(1, "negotiate posix pathnames");
2078 if (sb) 2249 if (sb)
2079 CIFS_SB(sb)->mnt_cifs_flags |= 2250 CIFS_SB(sb)->mnt_cifs_flags |=
2080 CIFS_MOUNT_POSIX_PATHS; 2251 CIFS_MOUNT_POSIX_PATHS;
@@ -2089,39 +2260,38 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
2089 if (sb && (CIFS_SB(sb)->rsize > 127 * 1024)) { 2260 if (sb && (CIFS_SB(sb)->rsize > 127 * 1024)) {
2090 if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) { 2261 if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) {
2091 CIFS_SB(sb)->rsize = 127 * 1024; 2262 CIFS_SB(sb)->rsize = 127 * 1024;
2092 cFYI(DBG2, 2263 cFYI(DBG2, "larger reads not supported by srv");
2093 ("larger reads not supported by srv"));
2094 } 2264 }
2095 } 2265 }
2096 2266
2097 2267
2098 cFYI(1, ("Negotiate caps 0x%x", (int)cap)); 2268 cFYI(1, "Negotiate caps 0x%x", (int)cap);
2099#ifdef CONFIG_CIFS_DEBUG2 2269#ifdef CONFIG_CIFS_DEBUG2
2100 if (cap & CIFS_UNIX_FCNTL_CAP) 2270 if (cap & CIFS_UNIX_FCNTL_CAP)
2101 cFYI(1, ("FCNTL cap")); 2271 cFYI(1, "FCNTL cap");
2102 if (cap & CIFS_UNIX_EXTATTR_CAP) 2272 if (cap & CIFS_UNIX_EXTATTR_CAP)
2103 cFYI(1, ("EXTATTR cap")); 2273 cFYI(1, "EXTATTR cap");
2104 if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) 2274 if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP)
2105 cFYI(1, ("POSIX path cap")); 2275 cFYI(1, "POSIX path cap");
2106 if (cap & CIFS_UNIX_XATTR_CAP) 2276 if (cap & CIFS_UNIX_XATTR_CAP)
2107 cFYI(1, ("XATTR cap")); 2277 cFYI(1, "XATTR cap");
2108 if (cap & CIFS_UNIX_POSIX_ACL_CAP) 2278 if (cap & CIFS_UNIX_POSIX_ACL_CAP)
2109 cFYI(1, ("POSIX ACL cap")); 2279 cFYI(1, "POSIX ACL cap");
2110 if (cap & CIFS_UNIX_LARGE_READ_CAP) 2280 if (cap & CIFS_UNIX_LARGE_READ_CAP)
2111 cFYI(1, ("very large read cap")); 2281 cFYI(1, "very large read cap");
2112 if (cap & CIFS_UNIX_LARGE_WRITE_CAP) 2282 if (cap & CIFS_UNIX_LARGE_WRITE_CAP)
2113 cFYI(1, ("very large write cap")); 2283 cFYI(1, "very large write cap");
2114#endif /* CIFS_DEBUG2 */ 2284#endif /* CIFS_DEBUG2 */
2115 if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) { 2285 if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) {
2116 if (vol_info == NULL) { 2286 if (vol_info == NULL) {
2117 cFYI(1, ("resetting capabilities failed")); 2287 cFYI(1, "resetting capabilities failed");
2118 } else 2288 } else
2119 cERROR(1, ("Negotiating Unix capabilities " 2289 cERROR(1, "Negotiating Unix capabilities "
2120 "with the server failed. Consider " 2290 "with the server failed. Consider "
2121 "mounting with the Unix Extensions\n" 2291 "mounting with the Unix Extensions\n"
2122 "disabled, if problems are found, " 2292 "disabled, if problems are found, "
2123 "by specifying the nounix mount " 2293 "by specifying the nounix mount "
2124 "option.")); 2294 "option.");
2125 2295
2126 } 2296 }
2127 } 2297 }
@@ -2151,8 +2321,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2151 struct cifs_sb_info *cifs_sb) 2321 struct cifs_sb_info *cifs_sb)
2152{ 2322{
2153 if (pvolume_info->rsize > CIFSMaxBufSize) { 2323 if (pvolume_info->rsize > CIFSMaxBufSize) {
2154 cERROR(1, ("rsize %d too large, using MaxBufSize", 2324 cERROR(1, "rsize %d too large, using MaxBufSize",
2155 pvolume_info->rsize)); 2325 pvolume_info->rsize);
2156 cifs_sb->rsize = CIFSMaxBufSize; 2326 cifs_sb->rsize = CIFSMaxBufSize;
2157 } else if ((pvolume_info->rsize) && 2327 } else if ((pvolume_info->rsize) &&
2158 (pvolume_info->rsize <= CIFSMaxBufSize)) 2328 (pvolume_info->rsize <= CIFSMaxBufSize))
@@ -2161,8 +2331,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2161 cifs_sb->rsize = CIFSMaxBufSize; 2331 cifs_sb->rsize = CIFSMaxBufSize;
2162 2332
2163 if (pvolume_info->wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) { 2333 if (pvolume_info->wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) {
2164 cERROR(1, ("wsize %d too large, using 4096 instead", 2334 cERROR(1, "wsize %d too large, using 4096 instead",
2165 pvolume_info->wsize)); 2335 pvolume_info->wsize);
2166 cifs_sb->wsize = 4096; 2336 cifs_sb->wsize = 4096;
2167 } else if (pvolume_info->wsize) 2337 } else if (pvolume_info->wsize)
2168 cifs_sb->wsize = pvolume_info->wsize; 2338 cifs_sb->wsize = pvolume_info->wsize;
@@ -2180,7 +2350,7 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2180 if (cifs_sb->rsize < 2048) { 2350 if (cifs_sb->rsize < 2048) {
2181 cifs_sb->rsize = 2048; 2351 cifs_sb->rsize = 2048;
2182 /* Windows ME may prefer this */ 2352 /* Windows ME may prefer this */
2183 cFYI(1, ("readsize set to minimum: 2048")); 2353 cFYI(1, "readsize set to minimum: 2048");
2184 } 2354 }
2185 /* calculate prepath */ 2355 /* calculate prepath */
2186 cifs_sb->prepath = pvolume_info->prepath; 2356 cifs_sb->prepath = pvolume_info->prepath;
@@ -2198,8 +2368,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2198 cifs_sb->mnt_gid = pvolume_info->linux_gid; 2368 cifs_sb->mnt_gid = pvolume_info->linux_gid;
2199 cifs_sb->mnt_file_mode = pvolume_info->file_mode; 2369 cifs_sb->mnt_file_mode = pvolume_info->file_mode;
2200 cifs_sb->mnt_dir_mode = pvolume_info->dir_mode; 2370 cifs_sb->mnt_dir_mode = pvolume_info->dir_mode;
2201 cFYI(1, ("file mode: 0x%x dir mode: 0x%x", 2371 cFYI(1, "file mode: 0x%x dir mode: 0x%x",
2202 cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode)); 2372 cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
2203 2373
2204 if (pvolume_info->noperm) 2374 if (pvolume_info->noperm)
2205 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM; 2375 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
@@ -2228,13 +2398,13 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2228 if (pvolume_info->dynperm) 2398 if (pvolume_info->dynperm)
2229 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM; 2399 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM;
2230 if (pvolume_info->direct_io) { 2400 if (pvolume_info->direct_io) {
2231 cFYI(1, ("mounting share using direct i/o")); 2401 cFYI(1, "mounting share using direct i/o");
2232 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; 2402 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
2233 } 2403 }
2234 2404
2235 if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm)) 2405 if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
2236 cERROR(1, ("mount option dynperm ignored if cifsacl " 2406 cERROR(1, "mount option dynperm ignored if cifsacl "
2237 "mount option supported")); 2407 "mount option supported");
2238} 2408}
2239 2409
2240static int 2410static int
@@ -2261,7 +2431,7 @@ cleanup_volume_info(struct smb_vol **pvolume_info)
2261{ 2431{
2262 struct smb_vol *volume_info; 2432 struct smb_vol *volume_info;
2263 2433
2264 if (!pvolume_info && !*pvolume_info) 2434 if (!pvolume_info || !*pvolume_info)
2265 return; 2435 return;
2266 2436
2267 volume_info = *pvolume_info; 2437 volume_info = *pvolume_info;
@@ -2343,11 +2513,11 @@ try_mount_again:
2343 } 2513 }
2344 2514
2345 if (volume_info->nullauth) { 2515 if (volume_info->nullauth) {
2346 cFYI(1, ("null user")); 2516 cFYI(1, "null user");
2347 volume_info->username = ""; 2517 volume_info->username = "";
2348 } else if (volume_info->username) { 2518 } else if (volume_info->username) {
2349 /* BB fixme parse for domain name here */ 2519 /* BB fixme parse for domain name here */
2350 cFYI(1, ("Username: %s", volume_info->username)); 2520 cFYI(1, "Username: %s", volume_info->username);
2351 } else { 2521 } else {
2352 cifserror("No username specified"); 2522 cifserror("No username specified");
2353 /* In userspace mount helper we can get user name from alternate 2523 /* In userspace mount helper we can get user name from alternate
@@ -2356,20 +2526,20 @@ try_mount_again:
2356 goto out; 2526 goto out;
2357 } 2527 }
2358 2528
2359
2360 /* this is needed for ASCII cp to Unicode converts */ 2529 /* this is needed for ASCII cp to Unicode converts */
2361 if (volume_info->iocharset == NULL) { 2530 if (volume_info->iocharset == NULL) {
2362 cifs_sb->local_nls = load_nls_default(); 2531 /* load_nls_default cannot return null */
2363 /* load_nls_default can not return null */ 2532 volume_info->local_nls = load_nls_default();
2364 } else { 2533 } else {
2365 cifs_sb->local_nls = load_nls(volume_info->iocharset); 2534 volume_info->local_nls = load_nls(volume_info->iocharset);
2366 if (cifs_sb->local_nls == NULL) { 2535 if (volume_info->local_nls == NULL) {
2367 cERROR(1, ("CIFS mount error: iocharset %s not found", 2536 cERROR(1, "CIFS mount error: iocharset %s not found",
2368 volume_info->iocharset)); 2537 volume_info->iocharset);
2369 rc = -ELIBACC; 2538 rc = -ELIBACC;
2370 goto out; 2539 goto out;
2371 } 2540 }
2372 } 2541 }
2542 cifs_sb->local_nls = volume_info->local_nls;
2373 2543
2374 /* get a reference to a tcp session */ 2544 /* get a reference to a tcp session */
2375 srvTcp = cifs_get_tcp_session(volume_info); 2545 srvTcp = cifs_get_tcp_session(volume_info);
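
A small but real behavioural change hides in the iocharset hunk above: the nls table is now resolved into volume_info before any session work — presumably because cifs_get_smb_ses() only receives volume_info and reads volume_info->local_nls for authentication — while cifs_sb afterwards keeps a shared pointer. Condensed (early return stands in for the function's goto out):

if (volume_info->iocharset == NULL)
	volume_info->local_nls = load_nls_default();	/* never NULL */
else {
	volume_info->local_nls = load_nls(volume_info->iocharset);
	if (volume_info->local_nls == NULL)
		return -ELIBACC;	/* iocharset not found */
}
cifs_sb->local_nls = volume_info->local_nls;	/* shared, not copied */
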
@@ -2378,148 +2548,30 @@ try_mount_again:
2378 goto out; 2548 goto out;
2379 } 2549 }
2380 2550
2381 pSesInfo = cifs_find_smb_ses(srvTcp, volume_info->username); 2551 /* get a reference to a SMB session */
2382 if (pSesInfo) { 2552 pSesInfo = cifs_get_smb_ses(srvTcp, volume_info);
2383 cFYI(1, ("Existing smb sess found (status=%d)", 2553 if (IS_ERR(pSesInfo)) {
2384 pSesInfo->status)); 2554 rc = PTR_ERR(pSesInfo);
2385 /* 2555 pSesInfo = NULL;
2386 * The existing SMB session already has a reference to srvTcp, 2556 goto mount_fail_check;
2387 * so we can put back the extra one we got before
2388 */
2389 cifs_put_tcp_session(srvTcp);
2390
2391 mutex_lock(&pSesInfo->session_mutex);
2392 if (pSesInfo->need_reconnect) {
2393 cFYI(1, ("Session needs reconnect"));
2394 rc = cifs_setup_session(xid, pSesInfo,
2395 cifs_sb->local_nls);
2396 }
2397 mutex_unlock(&pSesInfo->session_mutex);
2398 } else if (!rc) {
2399 cFYI(1, ("Existing smb sess not found"));
2400 pSesInfo = sesInfoAlloc();
2401 if (pSesInfo == NULL) {
2402 rc = -ENOMEM;
2403 goto mount_fail_check;
2404 }
2405
2406 /* new SMB session uses our srvTcp ref */
2407 pSesInfo->server = srvTcp;
2408 if (srvTcp->addr.sockAddr6.sin6_family == AF_INET6)
2409 sprintf(pSesInfo->serverName, "%pI6",
2410 &srvTcp->addr.sockAddr6.sin6_addr);
2411 else
2412 sprintf(pSesInfo->serverName, "%pI4",
2413 &srvTcp->addr.sockAddr.sin_addr.s_addr);
2414
2415 write_lock(&cifs_tcp_ses_lock);
2416 list_add(&pSesInfo->smb_ses_list, &srvTcp->smb_ses_list);
2417 write_unlock(&cifs_tcp_ses_lock);
2418
2419 /* volume_info->password freed at unmount */
2420 if (volume_info->password) {
2421 pSesInfo->password = kstrdup(volume_info->password,
2422 GFP_KERNEL);
2423 if (!pSesInfo->password) {
2424 rc = -ENOMEM;
2425 goto mount_fail_check;
2426 }
2427 }
2428 if (volume_info->username)
2429 strncpy(pSesInfo->userName, volume_info->username,
2430 MAX_USERNAME_SIZE);
2431 if (volume_info->domainname) {
2432 int len = strlen(volume_info->domainname);
2433 pSesInfo->domainName = kmalloc(len + 1, GFP_KERNEL);
2434 if (pSesInfo->domainName)
2435 strcpy(pSesInfo->domainName,
2436 volume_info->domainname);
2437 }
2438 pSesInfo->linux_uid = volume_info->linux_uid;
2439 pSesInfo->overrideSecFlg = volume_info->secFlg;
2440 mutex_lock(&pSesInfo->session_mutex);
2441
2442 /* BB FIXME need to pass vol->secFlgs BB */
2443 rc = cifs_setup_session(xid, pSesInfo,
2444 cifs_sb->local_nls);
2445 mutex_unlock(&pSesInfo->session_mutex);
2446 } 2557 }
2447 2558
2448 /* search for existing tcon to this server share */ 2559 setup_cifs_sb(volume_info, cifs_sb);
2449 if (!rc) { 2560 if (pSesInfo->capabilities & CAP_LARGE_FILES)
2450 setup_cifs_sb(volume_info, cifs_sb); 2561 sb->s_maxbytes = MAX_LFS_FILESIZE;
2451 2562 else
2452 tcon = cifs_find_tcon(pSesInfo, volume_info->UNC); 2563 sb->s_maxbytes = MAX_NON_LFS;
2453 if (tcon) {
2454 cFYI(1, ("Found match on UNC path"));
2455 /* existing tcon already has a reference */
2456 cifs_put_smb_ses(pSesInfo);
2457 if (tcon->seal != volume_info->seal)
2458 cERROR(1, ("transport encryption setting "
2459 "conflicts with existing tid"));
2460 } else {
2461 tcon = tconInfoAlloc();
2462 if (tcon == NULL) {
2463 rc = -ENOMEM;
2464 goto mount_fail_check;
2465 }
2466
2467 tcon->ses = pSesInfo;
2468 if (volume_info->password) {
2469 tcon->password = kstrdup(volume_info->password,
2470 GFP_KERNEL);
2471 if (!tcon->password) {
2472 rc = -ENOMEM;
2473 goto mount_fail_check;
2474 }
2475 }
2476
2477 if ((strchr(volume_info->UNC + 3, '\\') == NULL)
2478 && (strchr(volume_info->UNC + 3, '/') == NULL)) {
2479 cERROR(1, ("Missing share name"));
2480 rc = -ENODEV;
2481 goto mount_fail_check;
2482 } else {
2483 /* BB Do we need to wrap sesSem around
2484 * this TCon call and Unix SetFS as
2485 * we do on SessSetup and reconnect? */
2486 rc = CIFSTCon(xid, pSesInfo, volume_info->UNC,
2487 tcon, cifs_sb->local_nls);
2488 cFYI(1, ("CIFS Tcon rc = %d", rc));
2489 if (volume_info->nodfs) {
2490 tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
2491 cFYI(1, ("DFS disabled (%d)",
2492 tcon->Flags));
2493 }
2494 }
2495 if (rc)
2496 goto remote_path_check;
2497 tcon->seal = volume_info->seal;
2498 write_lock(&cifs_tcp_ses_lock);
2499 list_add(&tcon->tcon_list, &pSesInfo->tcon_list);
2500 write_unlock(&cifs_tcp_ses_lock);
2501 }
2502
2503 /* we can have only one retry value for a connection
2504 to a share so for resources mounted more than once
2505 to the same server share the last value passed in
2506 for the retry flag is used */
2507 tcon->retry = volume_info->retry;
2508 tcon->nocase = volume_info->nocase;
2509 tcon->local_lease = volume_info->local_lease;
2510 }
2511 if (pSesInfo) {
2512 if (pSesInfo->capabilities & CAP_LARGE_FILES)
2513 sb->s_maxbytes = MAX_LFS_FILESIZE;
2514 else
2515 sb->s_maxbytes = MAX_NON_LFS;
2516 }
2517 2564
2518 /* BB FIXME fix time_gran to be larger for LANMAN sessions */ 2565 /* BB FIXME fix time_gran to be larger for LANMAN sessions */
2519 sb->s_time_gran = 100; 2566 sb->s_time_gran = 100;
2520 2567
2521 if (rc) 2568 /* search for existing tcon to this server share */
2569 tcon = cifs_get_tcon(pSesInfo, volume_info);
2570 if (IS_ERR(tcon)) {
2571 rc = PTR_ERR(tcon);
2572 tcon = NULL;
2522 goto remote_path_check; 2573 goto remote_path_check;
2574 }
2523 2575
2524 cifs_sb->tcon = tcon; 2576 cifs_sb->tcon = tcon;
2525 2577
@@ -2543,7 +2595,7 @@ try_mount_again:
2543 2595
2544 if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) { 2596 if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) {
2545 cifs_sb->rsize = 1024 * 127; 2597 cifs_sb->rsize = 1024 * 127;
2546 cFYI(DBG2, ("no very large read support, rsize now 127K")); 2598 cFYI(DBG2, "no very large read support, rsize now 127K");
2547 } 2599 }
2548 if (!(tcon->ses->capabilities & CAP_LARGE_WRITE_X)) 2600 if (!(tcon->ses->capabilities & CAP_LARGE_WRITE_X))
2549 cifs_sb->wsize = min(cifs_sb->wsize, 2601 cifs_sb->wsize = min(cifs_sb->wsize,
@@ -2592,7 +2644,7 @@ remote_path_check:
2592 goto mount_fail_check; 2644 goto mount_fail_check;
2593 } 2645 }
2594 2646
2595 cFYI(1, ("Getting referral for: %s", full_path)); 2647 cFYI(1, "Getting referral for: %s", full_path);
2596 rc = get_dfs_path(xid, pSesInfo , full_path + 1, 2648 rc = get_dfs_path(xid, pSesInfo , full_path + 1,
2597 cifs_sb->local_nls, &num_referrals, &referrals, 2649 cifs_sb->local_nls, &num_referrals, &referrals,
2598 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 2650 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -2706,7 +2758,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2706 by Samba (not sure whether other servers allow 2758 by Samba (not sure whether other servers allow
2707 NTLMv2 password here) */ 2759 NTLMv2 password here) */
2708#ifdef CONFIG_CIFS_WEAK_PW_HASH 2760#ifdef CONFIG_CIFS_WEAK_PW_HASH
2709 if ((extended_security & CIFSSEC_MAY_LANMAN) && 2761 if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
2710 (ses->server->secType == LANMAN)) 2762 (ses->server->secType == LANMAN))
2711 calc_lanman_hash(tcon->password, ses->server->cryptKey, 2763 calc_lanman_hash(tcon->password, ses->server->cryptKey,
2712 ses->server->secMode & 2764 ses->server->secMode &
@@ -2777,13 +2829,13 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2777 if (length == 3) { 2829 if (length == 3) {
2778 if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') && 2830 if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') &&
2779 (bcc_ptr[2] == 'C')) { 2831 (bcc_ptr[2] == 'C')) {
2780 cFYI(1, ("IPC connection")); 2832 cFYI(1, "IPC connection");
2781 tcon->ipc = 1; 2833 tcon->ipc = 1;
2782 } 2834 }
2783 } else if (length == 2) { 2835 } else if (length == 2) {
2784 if ((bcc_ptr[0] == 'A') && (bcc_ptr[1] == ':')) { 2836 if ((bcc_ptr[0] == 'A') && (bcc_ptr[1] == ':')) {
2785 /* the most common case */ 2837 /* the most common case */
2786 cFYI(1, ("disk share connection")); 2838 cFYI(1, "disk share connection");
2787 } 2839 }
2788 } 2840 }
2789 bcc_ptr += length + 1; 2841 bcc_ptr += length + 1;
@@ -2796,7 +2848,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2796 bytes_left, is_unicode, 2848 bytes_left, is_unicode,
2797 nls_codepage); 2849 nls_codepage);
2798 2850
2799 cFYI(1, ("nativeFileSystem=%s", tcon->nativeFileSystem)); 2851 cFYI(1, "nativeFileSystem=%s", tcon->nativeFileSystem);
2800 2852
2801 if ((smb_buffer_response->WordCount == 3) || 2853 if ((smb_buffer_response->WordCount == 3) ||
2802 (smb_buffer_response->WordCount == 7)) 2854 (smb_buffer_response->WordCount == 7))
@@ -2804,7 +2856,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2804 tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport); 2856 tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport);
2805 else 2857 else
2806 tcon->Flags = 0; 2858 tcon->Flags = 0;
2807 cFYI(1, ("Tcon flags: 0x%x ", tcon->Flags)); 2859 cFYI(1, "Tcon flags: 0x%x ", tcon->Flags);
2808 } else if ((rc == 0) && tcon == NULL) { 2860 } else if ((rc == 0) && tcon == NULL) {
2809 /* all we need to save for IPC$ connection */ 2861 /* all we need to save for IPC$ connection */
2810 ses->ipc_tid = smb_buffer_response->Tid; 2862 ses->ipc_tid = smb_buffer_response->Tid;
@@ -2832,57 +2884,61 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
2832 return rc; 2884 return rc;
2833} 2885}
2834 2886
2835int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, 2887int cifs_negotiate_protocol(unsigned int xid, struct cifsSesInfo *ses)
2836 struct nls_table *nls_info)
2837{ 2888{
2838 int rc = 0; 2889 int rc = 0;
2839 int first_time = 0; 2890 struct TCP_Server_Info *server = ses->server;
2840 struct TCP_Server_Info *server = pSesInfo->server; 2891
2841 2892 /* only send once per connect */
2842 /* what if server changes its buffer size after dropping the session? */ 2893 if (server->maxBuf != 0)
2843 if (server->maxBuf == 0) /* no need to send on reconnect */ { 2894 return 0;
2844 rc = CIFSSMBNegotiate(xid, pSesInfo); 2895
2845 if (rc == -EAGAIN) { 2896 rc = CIFSSMBNegotiate(xid, ses);
2846 /* retry only once on 1st time connection */ 2897 if (rc == -EAGAIN) {
2847 rc = CIFSSMBNegotiate(xid, pSesInfo); 2898 /* retry only once on 1st time connection */
2848 if (rc == -EAGAIN) 2899 rc = CIFSSMBNegotiate(xid, ses);
2849 rc = -EHOSTDOWN; 2900 if (rc == -EAGAIN)
2850 } 2901 rc = -EHOSTDOWN;
2851 if (rc == 0) { 2902 }
2852 spin_lock(&GlobalMid_Lock); 2903 if (rc == 0) {
2853 if (server->tcpStatus != CifsExiting) 2904 spin_lock(&GlobalMid_Lock);
2854 server->tcpStatus = CifsGood; 2905 if (server->tcpStatus != CifsExiting)
2855 else 2906 server->tcpStatus = CifsGood;
2856 rc = -EHOSTDOWN; 2907 else
2857 spin_unlock(&GlobalMid_Lock); 2908 rc = -EHOSTDOWN;
2909 spin_unlock(&GlobalMid_Lock);
2858 2910
2859 }
2860 first_time = 1;
2861 } 2911 }
2862 2912
2863 if (rc) 2913 return rc;
2864 goto ss_err_exit; 2914}
2915
2916
2917int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
2918 struct nls_table *nls_info)
2919{
2920 int rc = 0;
2921 struct TCP_Server_Info *server = ses->server;
2865 2922
2866 pSesInfo->flags = 0; 2923 ses->flags = 0;
2867 pSesInfo->capabilities = server->capabilities; 2924 ses->capabilities = server->capabilities;
2868 if (linuxExtEnabled == 0) 2925 if (linuxExtEnabled == 0)
2869 pSesInfo->capabilities &= (~CAP_UNIX); 2926 ses->capabilities &= (~CAP_UNIX);
2870 2927
2871 cFYI(1, ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d", 2928 cFYI(1, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
2872 server->secMode, server->capabilities, server->timeAdj)); 2929 server->secMode, server->capabilities, server->timeAdj);
2873 2930
2874 rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info); 2931 rc = CIFS_SessSetup(xid, ses, nls_info);
2875 if (rc) { 2932 if (rc) {
2876 cERROR(1, ("Send error in SessSetup = %d", rc)); 2933 cERROR(1, "Send error in SessSetup = %d", rc);
2877 } else { 2934 } else {
2878 cFYI(1, ("CIFS Session Established successfully")); 2935 cFYI(1, "CIFS Session Established successfully");
2879 spin_lock(&GlobalMid_Lock); 2936 spin_lock(&GlobalMid_Lock);
2880 pSesInfo->status = CifsGood; 2937 ses->status = CifsGood;
2881 pSesInfo->need_reconnect = false; 2938 ses->need_reconnect = false;
2882 spin_unlock(&GlobalMid_Lock); 2939 spin_unlock(&GlobalMid_Lock);
2883 } 2940 }
2884 2941
2885ss_err_exit:
2886 return rc; 2942 return rc;
2887} 2943}
2888 2944
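The old cifs_setup_session() mixed per-TCP-connection protocol negotiation with per-session authentication, using a first_time flag to tell the two apart (note that CIFS_SessSetup() loses that argument above). The split makes the ordering explicit and idempotent: cifs_negotiate_protocol() returns immediately once server->maxBuf is non-zero. Call sites now follow the shape already seen in the cifs_get_smb_ses() hunk earlier in this diff:

mutex_lock(&ses->session_mutex);
rc = cifs_negotiate_protocol(xid, ses);	/* no-op after first success */
if (!rc)
	rc = cifs_setup_session(xid, ses, volume_info->local_nls);
mutex_unlock(&ses->session_mutex);
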
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 6ccf7262d1b7..391816b461ca 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -73,7 +73,7 @@ cifs_bp_rename_retry:
73 namelen += (1 + temp->d_name.len); 73 namelen += (1 + temp->d_name.len);
74 temp = temp->d_parent; 74 temp = temp->d_parent;
75 if (temp == NULL) { 75 if (temp == NULL) {
76 cERROR(1, ("corrupt dentry")); 76 cERROR(1, "corrupt dentry");
77 return NULL; 77 return NULL;
78 } 78 }
79 } 79 }
@@ -90,19 +90,18 @@ cifs_bp_rename_retry:
90 full_path[namelen] = dirsep; 90 full_path[namelen] = dirsep;
91 strncpy(full_path + namelen + 1, temp->d_name.name, 91 strncpy(full_path + namelen + 1, temp->d_name.name,
92 temp->d_name.len); 92 temp->d_name.len);
93 cFYI(0, ("name: %s", full_path + namelen)); 93 cFYI(0, "name: %s", full_path + namelen);
94 } 94 }
95 temp = temp->d_parent; 95 temp = temp->d_parent;
96 if (temp == NULL) { 96 if (temp == NULL) {
97 cERROR(1, ("corrupt dentry")); 97 cERROR(1, "corrupt dentry");
98 kfree(full_path); 98 kfree(full_path);
99 return NULL; 99 return NULL;
100 } 100 }
101 } 101 }
102 if (namelen != pplen + dfsplen) { 102 if (namelen != pplen + dfsplen) {
103 cERROR(1, 103 cERROR(1, "did not end path lookup where expected namelen is %d",
104 ("did not end path lookup where expected namelen is %d", 104 namelen);
105 namelen));
106 /* presumably this is only possible if racing with a rename 105 /* presumably this is only possible if racing with a rename
107 of one of the parent directories (we can not lock the dentries 106 of one of the parent directories (we can not lock the dentries
108 above us to prevent this, but retrying should be harmless) */ 107 above us to prevent this, but retrying should be harmless) */
@@ -130,6 +129,12 @@ cifs_bp_rename_retry:
130 return full_path; 129 return full_path;
131} 130}
132 131
132/*
133 * When called with struct file pointer set to NULL, there is no way we could
134 * update file->private_data, but getting it stuck on openFileList provides a
135 * way to access it from cifs_fill_filedata and thereby set file->private_data
136 * from cifs_open.
137 */
133struct cifsFileInfo * 138struct cifsFileInfo *
134cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, 139cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
135 struct file *file, struct vfsmount *mnt, unsigned int oflags) 140 struct file *file, struct vfsmount *mnt, unsigned int oflags)
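
The new comment above documents a deliberate trick: when cifs_new_fileinfo() is called with file == NULL (create runs before the VFS hands us a struct file), the entry is still linked onto openFileList so the open path can find it later. A hedged sketch of the rendezvous — the flow as described in the comments, not a new API:

/* create time: no struct file exists yet */
cifs_new_fileinfo(newinode, fileHandle, NULL, nd->path.mnt, oflags);

/* later, cifs_open() on the same inode */
pCifsFile = cifs_fill_filedata(file);	/* matches the openFileList entry
					   and sets file->private_data */
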
@@ -173,7 +178,7 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
173 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 178 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
174 pCifsInode->clientCanCacheAll = true; 179 pCifsInode->clientCanCacheAll = true;
175 pCifsInode->clientCanCacheRead = true; 180 pCifsInode->clientCanCacheRead = true;
176 cFYI(1, ("Exclusive Oplock inode %p", newinode)); 181 cFYI(1, "Exclusive Oplock inode %p", newinode);
177 } else if ((oplock & 0xF) == OPLOCK_READ) 182 } else if ((oplock & 0xF) == OPLOCK_READ)
178 pCifsInode->clientCanCacheRead = true; 183 pCifsInode->clientCanCacheRead = true;
179 } 184 }
@@ -183,16 +188,17 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
183} 188}
184 189
185int cifs_posix_open(char *full_path, struct inode **pinode, 190int cifs_posix_open(char *full_path, struct inode **pinode,
186 struct vfsmount *mnt, int mode, int oflags, 191 struct vfsmount *mnt, struct super_block *sb,
187 __u32 *poplock, __u16 *pnetfid, int xid) 192 int mode, int oflags,
193 __u32 *poplock, __u16 *pnetfid, int xid)
188{ 194{
189 int rc; 195 int rc;
190 FILE_UNIX_BASIC_INFO *presp_data; 196 FILE_UNIX_BASIC_INFO *presp_data;
191 __u32 posix_flags = 0; 197 __u32 posix_flags = 0;
192 struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb); 198 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
193 struct cifs_fattr fattr; 199 struct cifs_fattr fattr;
194 200
195 cFYI(1, ("posix open %s", full_path)); 201 cFYI(1, "posix open %s", full_path);
196 202
197 presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL); 203 presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
198 if (presp_data == NULL) 204 if (presp_data == NULL)
@@ -242,7 +248,8 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
242 248
243 /* get new inode and set it up */ 249 /* get new inode and set it up */
244 if (*pinode == NULL) { 250 if (*pinode == NULL) {
245 *pinode = cifs_iget(mnt->mnt_sb, &fattr); 251 cifs_fill_uniqueid(sb, &fattr);
252 *pinode = cifs_iget(sb, &fattr);
246 if (!*pinode) { 253 if (!*pinode) {
247 rc = -ENOMEM; 254 rc = -ENOMEM;
248 goto posix_open_ret; 255 goto posix_open_ret;
@@ -251,7 +258,18 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
251 cifs_fattr_to_inode(*pinode, &fattr); 258 cifs_fattr_to_inode(*pinode, &fattr);
252 } 259 }
253 260
254 cifs_new_fileinfo(*pinode, *pnetfid, NULL, mnt, oflags); 261 /*
262 * cifs_fill_filedata() takes care of setting cifsFileInfo pointer to
263 * file->private_data.
264 */
265 if (mnt) {
266 struct cifsFileInfo *pfile_info;
267
268 pfile_info = cifs_new_fileinfo(*pinode, *pnetfid, NULL, mnt,
269 oflags);
270 if (pfile_info == NULL)
271 rc = -ENOMEM;
272 }
255 273
256posix_open_ret: 274posix_open_ret:
257 kfree(presp_data); 275 kfree(presp_data);
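
cifs_posix_open() now takes the superblock explicitly and only instantiates a cifsFileInfo when a vfsmount is actually available, which lets callers without one (the nd == NULL nfsd-style paths) still perform the open. The two call shapes, as they appear later in this diff:

/* regular open: mnt present, fileinfo created and linked */
rc = cifs_posix_open(full_path, &inode, file->f_path.mnt, inode->i_sb,
		     cifs_sb->mnt_file_mode /* ignored */,
		     oflags, &oplock, &netfid, xid);

/* create path where nd may be NULL: pass NULL mnt, skip fileinfo */
rc = cifs_posix_open(full_path, &newinode, nd ? nd->path.mnt : NULL,
		     inode->i_sb, mode, oflags, &oplock, &fileHandle, xid);
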
@@ -315,13 +333,14 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
315 if (nd && (nd->flags & LOOKUP_OPEN)) 333 if (nd && (nd->flags & LOOKUP_OPEN))
316 oflags = nd->intent.open.flags; 334 oflags = nd->intent.open.flags;
317 else 335 else
318 oflags = FMODE_READ; 336 oflags = FMODE_READ | SMB_O_CREAT;
319 337
320 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) && 338 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
321 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 339 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
322 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 340 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
323 rc = cifs_posix_open(full_path, &newinode, nd->path.mnt, 341 rc = cifs_posix_open(full_path, &newinode,
324 mode, oflags, &oplock, &fileHandle, xid); 342 nd ? nd->path.mnt : NULL,
343 inode->i_sb, mode, oflags, &oplock, &fileHandle, xid);
325 /* EIO could indicate that (posix open) operation is not 344 /* EIO could indicate that (posix open) operation is not
326 supported, despite what server claimed in capability 345 supported, despite what server claimed in capability
 327 negotiation. EREMOTE indicates DFS junction, which is not 346
@@ -358,7 +377,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
358 else if ((oflags & O_CREAT) == O_CREAT) 377 else if ((oflags & O_CREAT) == O_CREAT)
359 disposition = FILE_OPEN_IF; 378 disposition = FILE_OPEN_IF;
360 else 379 else
361 cFYI(1, ("Create flag not set in create function")); 380 cFYI(1, "Create flag not set in create function");
362 } 381 }
363 382
364 /* BB add processing to set equivalent of mode - e.g. via CreateX with 383 /* BB add processing to set equivalent of mode - e.g. via CreateX with
@@ -394,7 +413,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
394 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 413 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
395 } 414 }
396 if (rc) { 415 if (rc) {
397 cFYI(1, ("cifs_create returned 0x%x", rc)); 416 cFYI(1, "cifs_create returned 0x%x", rc);
398 goto cifs_create_out; 417 goto cifs_create_out;
399 } 418 }
400 419
@@ -457,15 +476,22 @@ cifs_create_set_dentry:
457 if (rc == 0) 476 if (rc == 0)
458 setup_cifs_dentry(tcon, direntry, newinode); 477 setup_cifs_dentry(tcon, direntry, newinode);
459 else 478 else
460 cFYI(1, ("Create worked, get_inode_info failed rc = %d", rc)); 479 cFYI(1, "Create worked, get_inode_info failed rc = %d", rc);
461 480
462 /* nfsd case - nfs srv does not set nd */ 481 /* nfsd case - nfs srv does not set nd */
463 if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) { 482 if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) {
464 /* mknod case - do not leave file open */ 483 /* mknod case - do not leave file open */
465 CIFSSMBClose(xid, tcon, fileHandle); 484 CIFSSMBClose(xid, tcon, fileHandle);
466 } else if (!(posix_create) && (newinode)) { 485 } else if (!(posix_create) && (newinode)) {
467 cifs_new_fileinfo(newinode, fileHandle, NULL, 486 struct cifsFileInfo *pfile_info;
468 nd->path.mnt, oflags); 487 /*
488 * cifs_fill_filedata() takes care of setting cifsFileInfo
489 * pointer to file->private_data.
490 */
491 pfile_info = cifs_new_fileinfo(newinode, fileHandle, NULL,
492 nd->path.mnt, oflags);
493 if (pfile_info == NULL)
494 rc = -ENOMEM;
469 } 495 }
470cifs_create_out: 496cifs_create_out:
471 kfree(buf); 497 kfree(buf);
@@ -531,7 +557,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
531 u16 fileHandle; 557 u16 fileHandle;
532 FILE_ALL_INFO *buf; 558 FILE_ALL_INFO *buf;
533 559
534 cFYI(1, ("sfu compat create special file")); 560 cFYI(1, "sfu compat create special file");
535 561
536 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); 562 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
537 if (buf == NULL) { 563 if (buf == NULL) {
@@ -616,8 +642,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
616 642
617 xid = GetXid(); 643 xid = GetXid();
618 644
619 cFYI(1, ("parent inode = 0x%p name is: %s and dentry = 0x%p", 645 cFYI(1, "parent inode = 0x%p name is: %s and dentry = 0x%p",
620 parent_dir_inode, direntry->d_name.name, direntry)); 646 parent_dir_inode, direntry->d_name.name, direntry);
621 647
622 /* check whether path exists */ 648 /* check whether path exists */
623 649
@@ -632,7 +658,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
632 int i; 658 int i;
633 for (i = 0; i < direntry->d_name.len; i++) 659 for (i = 0; i < direntry->d_name.len; i++)
634 if (direntry->d_name.name[i] == '\\') { 660 if (direntry->d_name.name[i] == '\\') {
635 cFYI(1, ("Invalid file name")); 661 cFYI(1, "Invalid file name");
636 FreeXid(xid); 662 FreeXid(xid);
637 return ERR_PTR(-EINVAL); 663 return ERR_PTR(-EINVAL);
638 } 664 }
@@ -657,11 +683,11 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
657 } 683 }
658 684
659 if (direntry->d_inode != NULL) { 685 if (direntry->d_inode != NULL) {
660 cFYI(1, ("non-NULL inode in lookup")); 686 cFYI(1, "non-NULL inode in lookup");
661 } else { 687 } else {
662 cFYI(1, ("NULL inode in lookup")); 688 cFYI(1, "NULL inode in lookup");
663 } 689 }
664 cFYI(1, ("Full path: %s inode = 0x%p", full_path, direntry->d_inode)); 690 cFYI(1, "Full path: %s inode = 0x%p", full_path, direntry->d_inode);
665 691
666 /* Posix open is only called (at lookup time) for file create now. 692 /* Posix open is only called (at lookup time) for file create now.
667 * For opens (rather than creates), because we do not know if it 693 * For opens (rather than creates), because we do not know if it
@@ -678,6 +704,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
678 (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open && 704 (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
679 (nd->intent.open.flags & O_CREAT)) { 705 (nd->intent.open.flags & O_CREAT)) {
680 rc = cifs_posix_open(full_path, &newInode, nd->path.mnt, 706 rc = cifs_posix_open(full_path, &newInode, nd->path.mnt,
707 parent_dir_inode->i_sb,
681 nd->intent.open.create_mode, 708 nd->intent.open.create_mode,
682 nd->intent.open.flags, &oplock, 709 nd->intent.open.flags, &oplock,
683 &fileHandle, xid); 710 &fileHandle, xid);
@@ -723,7 +750,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
723 /* if it was once a directory (but how can we tell?) we could do 750 /* if it was once a directory (but how can we tell?) we could do
724 shrink_dcache_parent(direntry); */ 751 shrink_dcache_parent(direntry); */
725 } else if (rc != -EACCES) { 752 } else if (rc != -EACCES) {
726 cERROR(1, ("Unexpected lookup error %d", rc)); 753 cERROR(1, "Unexpected lookup error %d", rc);
727 /* We special case check for Access Denied - since that 754 /* We special case check for Access Denied - since that
728 is a common return code */ 755 is a common return code */
729 } 756 }
@@ -739,11 +766,11 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
739 int isValid = 1; 766 int isValid = 1;
740 767
741 if (direntry->d_inode) { 768 if (direntry->d_inode) {
742 if (cifs_revalidate(direntry)) 769 if (cifs_revalidate_dentry(direntry))
743 return 0; 770 return 0;
744 } else { 771 } else {
745 cFYI(1, ("neg dentry 0x%p name = %s", 772 cFYI(1, "neg dentry 0x%p name = %s",
746 direntry, direntry->d_name.name)); 773 direntry, direntry->d_name.name);
747 if (time_after(jiffies, direntry->d_time + HZ) || 774 if (time_after(jiffies, direntry->d_time + HZ) ||
748 !lookupCacheEnabled) { 775 !lookupCacheEnabled) {
749 d_drop(direntry); 776 d_drop(direntry);
@@ -758,7 +785,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
758{ 785{
759 int rc = 0; 786 int rc = 0;
760 787
761 cFYI(1, ("In cifs d_delete, name = %s", direntry->d_name.name)); 788 cFYI(1, "In cifs d_delete, name = %s", direntry->d_name.name);
762 789
763 return rc; 790 return rc;
764} */ 791} */
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 87948147d7ec..4db2c5e7283f 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -23,6 +23,7 @@
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */ 24 */
25 25
26#include <linux/slab.h>
26#include <keys/user-type.h> 27#include <keys/user-type.h>
27#include "dns_resolve.h" 28#include "dns_resolve.h"
28#include "cifsglob.h" 29#include "cifsglob.h"
@@ -105,14 +106,14 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
105 /* search for server name delimiter */ 106 /* search for server name delimiter */
106 len = strlen(unc); 107 len = strlen(unc);
107 if (len < 3) { 108 if (len < 3) {
108 cFYI(1, ("%s: unc is too short: %s", __func__, unc)); 109 cFYI(1, "%s: unc is too short: %s", __func__, unc);
109 return -EINVAL; 110 return -EINVAL;
110 } 111 }
111 len -= 2; 112 len -= 2;
112 name = memchr(unc+2, '\\', len); 113 name = memchr(unc+2, '\\', len);
113 if (!name) { 114 if (!name) {
114 cFYI(1, ("%s: probably server name is whole unc: %s", 115 cFYI(1, "%s: probably server name is whole unc: %s",
115 __func__, unc)); 116 __func__, unc);
116 } else { 117 } else {
117 len = (name - unc) - 2/* leading // */; 118 len = (name - unc) - 2/* leading // */;
118 } 119 }
@@ -126,8 +127,8 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
126 name[len] = 0; 127 name[len] = 0;
127 128
128 if (is_ip(name)) { 129 if (is_ip(name)) {
129 cFYI(1, ("%s: it is IP, skipping dns upcall: %s", 130 cFYI(1, "%s: it is IP, skipping dns upcall: %s",
130 __func__, name)); 131 __func__, name);
131 data = name; 132 data = name;
132 goto skip_upcall; 133 goto skip_upcall;
133 } 134 }
@@ -137,7 +138,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
137 len = rkey->type_data.x[0]; 138 len = rkey->type_data.x[0];
138 data = rkey->payload.data; 139 data = rkey->payload.data;
139 } else { 140 } else {
140 cERROR(1, ("%s: unable to resolve: %s", __func__, name)); 141 cERROR(1, "%s: unable to resolve: %s", __func__, name);
141 goto out; 142 goto out;
142 } 143 }
143 144
@@ -147,10 +148,10 @@ skip_upcall:
147 if (*ip_addr) { 148 if (*ip_addr) {
148 memcpy(*ip_addr, data, len + 1); 149 memcpy(*ip_addr, data, len + 1);
149 if (!IS_ERR(rkey)) 150 if (!IS_ERR(rkey))
150 cFYI(1, ("%s: resolved: %s to %s", __func__, 151 cFYI(1, "%s: resolved: %s to %s", __func__,
151 name, 152 name,
152 *ip_addr 153 *ip_addr
153 )); 154 );
154 rc = 0; 155 rc = 0;
155 } else { 156 } else {
156 rc = -ENOMEM; 157 rc = -ENOMEM;
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index 6177f7cca16a..993f82045bf6 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -49,7 +49,7 @@
49static struct dentry *cifs_get_parent(struct dentry *dentry) 49static struct dentry *cifs_get_parent(struct dentry *dentry)
50{ 50{
51 /* BB need to add code here eventually to enable export via NFSD */ 51 /* BB need to add code here eventually to enable export via NFSD */
52 cFYI(1, ("get parent for %p", dentry)); 52 cFYI(1, "get parent for %p", dentry);
53 return ERR_PTR(-EACCES); 53 return ERR_PTR(-EACCES);
54} 54}
55 55
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 3d8f8a96f5a3..a83541ec9713 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * vfs operations that deal with files 4 * vfs operations that deal with files
5 * 5 *
6 * Copyright (C) International Business Machines Corp., 2002,2007 6 * Copyright (C) International Business Machines Corp., 2002,2010
7 * Author(s): Steve French (sfrench@us.ibm.com) 7 * Author(s): Steve French (sfrench@us.ibm.com)
8 * Jeremy Allison (jra@samba.org) 8 * Jeremy Allison (jra@samba.org)
9 * 9 *
@@ -31,6 +31,7 @@
31#include <linux/task_io_accounting_ops.h> 31#include <linux/task_io_accounting_ops.h>
32#include <linux/delay.h> 32#include <linux/delay.h>
33#include <linux/mount.h> 33#include <linux/mount.h>
34#include <linux/slab.h>
34#include <asm/div64.h> 35#include <asm/div64.h>
35#include "cifsfs.h" 36#include "cifsfs.h"
36#include "cifspdu.h" 37#include "cifspdu.h"
@@ -107,8 +108,7 @@ static inline int cifs_get_disposition(unsigned int flags)
107/* all arguments to this function must be checked for validity in caller */ 108/* all arguments to this function must be checked for validity in caller */
108static inline int 109static inline int
109cifs_posix_open_inode_helper(struct inode *inode, struct file *file, 110cifs_posix_open_inode_helper(struct inode *inode, struct file *file,
110 struct cifsInodeInfo *pCifsInode, 111 struct cifsInodeInfo *pCifsInode, __u32 oplock,
111 struct cifsFileInfo *pCifsFile, __u32 oplock,
112 u16 netfid) 112 u16 netfid)
113{ 113{
114 114
@@ -135,15 +135,15 @@ cifs_posix_open_inode_helper(struct inode *inode, struct file *file,
135 if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) && 135 if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
136 (file->f_path.dentry->d_inode->i_size == 136 (file->f_path.dentry->d_inode->i_size ==
137 (loff_t)le64_to_cpu(buf->EndOfFile))) { 137 (loff_t)le64_to_cpu(buf->EndOfFile))) {
138 cFYI(1, ("inode unchanged on server")); 138 cFYI(1, "inode unchanged on server");
139 } else { 139 } else {
140 if (file->f_path.dentry->d_inode->i_mapping) { 140 if (file->f_path.dentry->d_inode->i_mapping) {
141 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping); 141 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
142 if (rc != 0) 142 if (rc != 0)
143 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc; 143 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
144 } 144 }
145 cFYI(1, ("invalidating remote inode since open detected it " 145 cFYI(1, "invalidating remote inode since open detected it "
146 "changed")); 146 "changed");
147 invalidate_remote_inode(file->f_path.dentry->d_inode); 147 invalidate_remote_inode(file->f_path.dentry->d_inode);
148 } */ 148 } */
149 149
@@ -151,8 +151,8 @@ psx_client_can_cache:
151 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 151 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
152 pCifsInode->clientCanCacheAll = true; 152 pCifsInode->clientCanCacheAll = true;
153 pCifsInode->clientCanCacheRead = true; 153 pCifsInode->clientCanCacheRead = true;
154 cFYI(1, ("Exclusive Oplock granted on inode %p", 154 cFYI(1, "Exclusive Oplock granted on inode %p",
155 file->f_path.dentry->d_inode)); 155 file->f_path.dentry->d_inode);
156 } else if ((oplock & 0xF) == OPLOCK_READ) 156 } else if ((oplock & 0xF) == OPLOCK_READ)
157 pCifsInode->clientCanCacheRead = true; 157 pCifsInode->clientCanCacheRead = true;
158 158
@@ -189,8 +189,8 @@ cifs_fill_filedata(struct file *file)
189 if (file->private_data != NULL) { 189 if (file->private_data != NULL) {
190 return pCifsFile; 190 return pCifsFile;
191 } else if ((file->f_flags & O_CREAT) && (file->f_flags & O_EXCL)) 191 } else if ((file->f_flags & O_CREAT) && (file->f_flags & O_EXCL))
192 cERROR(1, ("could not find file instance for " 192 cERROR(1, "could not find file instance for "
193 "new file %p", file)); 193 "new file %p", file);
194 return NULL; 194 return NULL;
195} 195}
196 196
@@ -216,17 +216,17 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
 	if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
 	    (file->f_path.dentry->d_inode->i_size ==
 	     (loff_t)le64_to_cpu(buf->EndOfFile))) {
-		cFYI(1, ("inode unchanged on server"));
+		cFYI(1, "inode unchanged on server");
 	} else {
 		if (file->f_path.dentry->d_inode->i_mapping) {
 			/* BB no need to lock inode until after invalidate
 			   since namei code should already have it locked? */
 			rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
 			if (rc != 0)
 				CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
 		}
-		cFYI(1, ("invalidating remote inode since open detected it "
-			 "changed"));
+		cFYI(1, "invalidating remote inode since open detected it "
+			 "changed");
 		invalidate_remote_inode(file->f_path.dentry->d_inode);
 	}
 
@@ -241,8 +241,8 @@ client_can_cache:
 	if ((*oplock & 0xF) == OPLOCK_EXCLUSIVE) {
 		pCifsInode->clientCanCacheAll = true;
 		pCifsInode->clientCanCacheRead = true;
-		cFYI(1, ("Exclusive Oplock granted on inode %p",
-			 file->f_path.dentry->d_inode));
+		cFYI(1, "Exclusive Oplock granted on inode %p",
+			 file->f_path.dentry->d_inode);
 	} else if ((*oplock & 0xF) == OPLOCK_READ)
 		pCifsInode->clientCanCacheRead = true;
 
@@ -284,8 +284,8 @@ int cifs_open(struct inode *inode, struct file *file)
 		return rc;
 	}
 
-	cFYI(1, ("inode = 0x%p file flags are 0x%x for %s",
-		 inode, file->f_flags, full_path));
+	cFYI(1, "inode = 0x%p file flags are 0x%x for %s",
+		 inode, file->f_flags, full_path);
 
 	if (oplockEnabled)
 		oplock = REQ_OPLOCK;
@@ -297,27 +297,29 @@ int cifs_open(struct inode *inode, struct file *file)
 	    (CIFS_UNIX_POSIX_PATH_OPS_CAP &
 			le64_to_cpu(tcon->fsUnixInfo.Capability))) {
 		int oflags = (int) cifs_posix_convert_flags(file->f_flags);
+		oflags |= SMB_O_CREAT;
 		/* can not refresh inode info since size could be stale */
 		rc = cifs_posix_open(full_path, &inode, file->f_path.mnt,
-				cifs_sb->mnt_file_mode /* ignored */,
-				oflags, &oplock, &netfid, xid);
+				inode->i_sb,
+				cifs_sb->mnt_file_mode /* ignored */,
+				oflags, &oplock, &netfid, xid);
 		if (rc == 0) {
-			cFYI(1, ("posix open succeeded"));
+			cFYI(1, "posix open succeeded");
 			/* no need for special case handling of setting mode
 			   on read only files needed here */
 
 			pCifsFile = cifs_fill_filedata(file);
 			cifs_posix_open_inode_helper(inode, file, pCifsInode,
-						     pCifsFile, oplock, netfid);
+						     oplock, netfid);
 			goto out;
 		} else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
 			if (tcon->ses->serverNOS)
-				cERROR(1, ("server %s of type %s returned"
+				cERROR(1, "server %s of type %s returned"
 					   " unexpected error on SMB posix open"
 					   ", disabling posix open support."
 					   " Check if server update available.",
 					   tcon->ses->serverName,
-					   tcon->ses->serverNOS));
+					   tcon->ses->serverNOS);
 			tcon->broken_posix_open = true;
 		} else if ((rc != -EIO) && (rc != -EREMOTE) &&
 			 (rc != -EOPNOTSUPP)) /* path not found or net err */
@@ -385,7 +387,7 @@ int cifs_open(struct inode *inode, struct file *file)
 					    & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	}
 	if (rc) {
-		cFYI(1, ("cifs_open returned 0x%x", rc));
+		cFYI(1, "cifs_open returned 0x%x", rc);
 		goto out;
 	}
 
@@ -468,7 +470,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
 	}
 
 	if (file->f_path.dentry == NULL) {
-		cERROR(1, ("no valid name if dentry freed"));
+		cERROR(1, "no valid name if dentry freed");
 		dump_stack();
 		rc = -EBADF;
 		goto reopen_error_exit;
@@ -476,7 +478,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
 
 	inode = file->f_path.dentry->d_inode;
 	if (inode == NULL) {
-		cERROR(1, ("inode not valid"));
+		cERROR(1, "inode not valid");
 		dump_stack();
 		rc = -EBADF;
 		goto reopen_error_exit;
@@ -498,8 +500,8 @@ reopen_error_exit:
 		return rc;
 	}
 
-	cFYI(1, ("inode = 0x%p file flags 0x%x for %s",
-		 inode, file->f_flags, full_path));
+	cFYI(1, "inode = 0x%p file flags 0x%x for %s",
+		 inode, file->f_flags, full_path);
 
 	if (oplockEnabled)
 		oplock = REQ_OPLOCK;
@@ -512,10 +514,11 @@ reopen_error_exit:
 		int oflags = (int) cifs_posix_convert_flags(file->f_flags);
 		/* can not refresh inode info since size could be stale */
 		rc = cifs_posix_open(full_path, NULL, file->f_path.mnt,
-				cifs_sb->mnt_file_mode /* ignored */,
-				oflags, &oplock, &netfid, xid);
+				inode->i_sb,
+				cifs_sb->mnt_file_mode /* ignored */,
+				oflags, &oplock, &netfid, xid);
 		if (rc == 0) {
-			cFYI(1, ("posix reopen succeeded"));
+			cFYI(1, "posix reopen succeeded");
 			goto reopen_success;
 		}
 		/* fallthrough to retry open the old way on errors, especially
@@ -536,8 +539,8 @@ reopen_error_exit:
 				 CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc) {
 		mutex_unlock(&pCifsFile->fh_mutex);
-		cFYI(1, ("cifs_open returned 0x%x", rc));
-		cFYI(1, ("oplock: %d", oplock));
+		cFYI(1, "cifs_open returned 0x%x", rc);
+		cFYI(1, "oplock: %d", oplock);
 	} else {
 reopen_success:
 		pCifsFile->netfid = netfid;
@@ -569,8 +572,8 @@ reopen_success:
 	if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
 		pCifsInode->clientCanCacheAll = true;
 		pCifsInode->clientCanCacheRead = true;
-		cFYI(1, ("Exclusive Oplock granted on inode %p",
-			 file->f_path.dentry->d_inode));
+		cFYI(1, "Exclusive Oplock granted on inode %p",
+			 file->f_path.dentry->d_inode);
 	} else if ((oplock & 0xF) == OPLOCK_READ) {
 		pCifsInode->clientCanCacheRead = true;
 		pCifsInode->clientCanCacheAll = false;
@@ -618,8 +621,7 @@ int cifs_close(struct inode *inode, struct file *file)
 			   the struct would be in each open file,
 			   but this should give enough time to
 			   clear the socket */
-			cFYI(DBG2,
-				("close delay, write pending"));
+			cFYI(DBG2, "close delay, write pending");
 			msleep(timeout);
 			timeout *= 4;
 		}
@@ -652,7 +654,7 @@ int cifs_close(struct inode *inode, struct file *file)
 
 	read_lock(&GlobalSMBSeslock);
 	if (list_empty(&(CIFS_I(inode)->openFileList))) {
-		cFYI(1, ("closing last open instance for inode %p", inode));
+		cFYI(1, "closing last open instance for inode %p", inode);
 		/* if the file is not open we do not know if we can cache info
 		   on this inode, much less write behind and read ahead */
 		CIFS_I(inode)->clientCanCacheRead = false;
@@ -673,7 +675,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
 		(struct cifsFileInfo *)file->private_data;
 	char *ptmp;
 
-	cFYI(1, ("Closedir inode = 0x%p", inode));
+	cFYI(1, "Closedir inode = 0x%p", inode);
 
 	xid = GetXid();
 
@@ -684,22 +686,22 @@ int cifs_closedir(struct inode *inode, struct file *file)
 
 	pTcon = cifs_sb->tcon;
 
-	cFYI(1, ("Freeing private data in close dir"));
+	cFYI(1, "Freeing private data in close dir");
 	write_lock(&GlobalSMBSeslock);
 	if (!pCFileStruct->srch_inf.endOfSearch &&
 	    !pCFileStruct->invalidHandle) {
 		pCFileStruct->invalidHandle = true;
 		write_unlock(&GlobalSMBSeslock);
 		rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid);
-		cFYI(1, ("Closing uncompleted readdir with rc %d",
-			 rc));
+		cFYI(1, "Closing uncompleted readdir with rc %d",
+			 rc);
 		/* not much we can do if it fails anyway, ignore rc */
 		rc = 0;
 	} else
 		write_unlock(&GlobalSMBSeslock);
 	ptmp = pCFileStruct->srch_inf.ntwrk_buf_start;
 	if (ptmp) {
-		cFYI(1, ("closedir free smb buf in srch struct"));
+		cFYI(1, "closedir free smb buf in srch struct");
 		pCFileStruct->srch_inf.ntwrk_buf_start = NULL;
 		if (pCFileStruct->srch_inf.smallBuf)
 			cifs_small_buf_release(ptmp);
@@ -747,49 +749,49 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 	rc = -EACCES;
 	xid = GetXid();
 
-	cFYI(1, ("Lock parm: 0x%x flockflags: "
+	cFYI(1, "Lock parm: 0x%x flockflags: "
 		 "0x%x flocktype: 0x%x start: %lld end: %lld",
 		cmd, pfLock->fl_flags, pfLock->fl_type, pfLock->fl_start,
-		pfLock->fl_end));
+		pfLock->fl_end);
 
 	if (pfLock->fl_flags & FL_POSIX)
-		cFYI(1, ("Posix"));
+		cFYI(1, "Posix");
 	if (pfLock->fl_flags & FL_FLOCK)
-		cFYI(1, ("Flock"));
+		cFYI(1, "Flock");
 	if (pfLock->fl_flags & FL_SLEEP) {
-		cFYI(1, ("Blocking lock"));
+		cFYI(1, "Blocking lock");
 		wait_flag = true;
 	}
 	if (pfLock->fl_flags & FL_ACCESS)
-		cFYI(1, ("Process suspended by mandatory locking - "
-			 "not implemented yet"));
+		cFYI(1, "Process suspended by mandatory locking - "
+			 "not implemented yet");
 	if (pfLock->fl_flags & FL_LEASE)
-		cFYI(1, ("Lease on file - not implemented yet"));
+		cFYI(1, "Lease on file - not implemented yet");
 	if (pfLock->fl_flags &
 	    (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE)))
-		cFYI(1, ("Unknown lock flags 0x%x", pfLock->fl_flags));
+		cFYI(1, "Unknown lock flags 0x%x", pfLock->fl_flags);
 
 	if (pfLock->fl_type == F_WRLCK) {
-		cFYI(1, ("F_WRLCK "));
+		cFYI(1, "F_WRLCK ");
 		numLock = 1;
 	} else if (pfLock->fl_type == F_UNLCK) {
-		cFYI(1, ("F_UNLCK"));
+		cFYI(1, "F_UNLCK");
 		numUnlock = 1;
 		/* Check if unlock includes more than
 		   one lock range */
 	} else if (pfLock->fl_type == F_RDLCK) {
-		cFYI(1, ("F_RDLCK"));
+		cFYI(1, "F_RDLCK");
 		lockType |= LOCKING_ANDX_SHARED_LOCK;
 		numLock = 1;
 	} else if (pfLock->fl_type == F_EXLCK) {
-		cFYI(1, ("F_EXLCK"));
+		cFYI(1, "F_EXLCK");
 		numLock = 1;
 	} else if (pfLock->fl_type == F_SHLCK) {
-		cFYI(1, ("F_SHLCK"));
+		cFYI(1, "F_SHLCK");
 		lockType |= LOCKING_ANDX_SHARED_LOCK;
 		numLock = 1;
 	} else
-		cFYI(1, ("Unknown type of lock"));
+		cFYI(1, "Unknown type of lock");
 
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 	tcon = cifs_sb->tcon;
@@ -832,14 +834,38 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 					 0 /* wait flag */ );
 			pfLock->fl_type = F_UNLCK;
 			if (rc != 0)
-				cERROR(1, ("Error unlocking previously locked "
-					   "range %d during test of lock", rc));
+				cERROR(1, "Error unlocking previously locked "
+					   "range %d during test of lock", rc);
 			rc = 0;
 
 		} else {
 			/* if rc == ERR_SHARING_VIOLATION ? */
-			rc = 0;	/* do not change lock type to unlock
-				   since range in use */
+			rc = 0;
+
+			if (lockType & LOCKING_ANDX_SHARED_LOCK) {
+				pfLock->fl_type = F_WRLCK;
+			} else {
+				rc = CIFSSMBLock(xid, tcon, netfid, length,
+					pfLock->fl_start, 0, 1,
+					lockType | LOCKING_ANDX_SHARED_LOCK,
+					0 /* wait flag */);
+				if (rc == 0) {
+					rc = CIFSSMBLock(xid, tcon, netfid,
+						length, pfLock->fl_start, 1, 0,
+						lockType |
+						LOCKING_ANDX_SHARED_LOCK,
+						0 /* wait flag */);
+					pfLock->fl_type = F_RDLCK;
+					if (rc != 0)
+						cERROR(1, "Error unlocking "
+						"previously locked range %d "
+						"during test of lock", rc);
+					rc = 0;
+				} else {
+					pfLock->fl_type = F_WRLCK;
+					rc = 0;
+				}
+			}
 		}
 
 		FreeXid(xid);
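Note: the block added above changes how a conflicting lock is reported when an F_WRLCK test finds the range in use. Instead of leaving fl_type unset, the code probes the range with a shared lock: if the shared lock succeeds (and is immediately unlocked again), the conflicting holder can only have a read lock, so F_RDLCK is reported; if even a shared lock fails, F_WRLCK is reported. A user-space sketch of that decision, with try_shared_lock()/unlock_shared() as hypothetical stand-ins for the CIFSSMBLock() probe and its matching unlock:

    #include <stdbool.h>
    #include <stdio.h>

    /* pretend the contested server range is only read-locked */
    static bool range_is_write_locked = false;

    /* stand-ins for the CIFSSMBLock() probe/unlock pair above */
    static bool try_shared_lock(void) { return !range_is_write_locked; }
    static void unlock_shared(void)   { /* undo the probe lock */ }

    /* classify the lock that blocked an F_WRLCK test */
    static const char *conflicting_lock_type(void)
    {
        if (try_shared_lock()) {
            unlock_shared();      /* probe only -- release it */
            return "F_RDLCK";     /* holder has a read lock */
        }
        return "F_WRLCK";         /* holder has a write lock */
    }

    int main(void)
    {
        printf("conflicting lock reported as %s\n",
               conflicting_lock_type());
        return 0;
    }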
@@ -898,9 +924,10 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 						1, 0, li->type, false);
 			if (stored_rc)
 				rc = stored_rc;
-
+			else {
 				list_del(&li->llist);
 				kfree(li);
+			}
 		}
 	}
 	mutex_unlock(&fid->lock_mutex);
@@ -963,9 +990,8 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 
 	pTcon = cifs_sb->tcon;
 
-	/* cFYI(1,
-	   (" write %d bytes to offset %lld of %s", write_size,
-	   *poffset, file->f_path.dentry->d_name.name)); */
+	/* cFYI(1, " write %d bytes to offset %lld of %s", write_size,
+	   *poffset, file->f_path.dentry->d_name.name); */
 
 	if (file->private_data == NULL)
 		return -EBADF;
@@ -1066,8 +1092,8 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 
 	pTcon = cifs_sb->tcon;
 
-	cFYI(1, ("write %zd bytes to offset %lld of %s", write_size,
-	   *poffset, file->f_path.dentry->d_name.name));
+	cFYI(1, "write %zd bytes to offset %lld of %s", write_size,
+	   *poffset, file->f_path.dentry->d_name.name);
 
 	if (file->private_data == NULL)
 		return -EBADF;
@@ -1208,7 +1234,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
    it being zero) during stress testcases so we need to check for it */
 
 	if (cifs_inode == NULL) {
-		cERROR(1, ("Null inode passed to cifs_writeable_file"));
+		cERROR(1, "Null inode passed to cifs_writeable_file");
 		dump_stack();
 		return NULL;
 	}
@@ -1252,7 +1278,7 @@ refind_writable:
 				   again. Note that it would be bad
 				   to hold up writepages here (rather than
 				   in caller) with continuous retries */
-				cFYI(1, ("wp failed on reopen file"));
+				cFYI(1, "wp failed on reopen file");
 				read_lock(&GlobalSMBSeslock);
 				/* can not use this handle, no write
 				   pending on this one after all */
@@ -1328,7 +1354,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 	else if (bytes_written < 0)
 		rc = bytes_written;
 	} else {
-		cFYI(1, ("No writeable filehandles for inode"));
+		cFYI(1, "No writeable filehandles for inode");
 		rc = -EIO;
 	}
 
@@ -1500,7 +1526,7 @@ retry:
 			 */
 			open_file = find_writable_file(CIFS_I(mapping->host));
 			if (!open_file) {
-				cERROR(1, ("No writable handles for inode"));
+				cERROR(1, "No writable handles for inode");
 				rc = -EBADF;
 			} else {
 				long_op = cifs_write_timeout(cifsi, offset);
@@ -1513,8 +1539,8 @@ retry:
 					cifs_update_eof(cifsi, offset, bytes_written);
 
 				if (rc || bytes_written < bytes_to_write) {
-					cERROR(1, ("Write2 ret %d, wrote %d",
-						  rc, bytes_written));
+					cERROR(1, "Write2 ret %d, wrote %d",
+						  rc, bytes_written);
 					/* BB what if continued retry is
 					   requested via mount flags? */
 					if (rc == -ENOSPC)
@@ -1575,7 +1601,7 @@ static int cifs_writepage(struct page *page, struct writeback_control *wbc)
 /* BB add check for wbc flags */
 	page_cache_get(page);
 	if (!PageUptodate(page))
-		cFYI(1, ("ppw - page not up to date"));
+		cFYI(1, "ppw - page not up to date");
 
 	/*
 	 * Set the "writeback" flag, and clear "dirty" in the radix tree.
@@ -1604,8 +1630,8 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
 	int rc;
 	struct inode *inode = mapping->host;
 
-	cFYI(1, ("write_end for page %p from pos %lld with %d bytes",
-		 page, pos, copied));
+	cFYI(1, "write_end for page %p from pos %lld with %d bytes",
+		 page, pos, copied);
 
 	if (PageChecked(page)) {
 		if (copied == len)
@@ -1661,8 +1687,8 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
 
 	xid = GetXid();
 
-	cFYI(1, ("Sync file - name: %s datasync: 0x%x",
-		dentry->d_name.name, datasync));
+	cFYI(1, "Sync file - name: %s datasync: 0x%x",
+		dentry->d_name.name, datasync);
 
 	rc = filemap_write_and_wait(inode->i_mapping);
 	if (rc == 0) {
@@ -1686,7 +1712,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
 	unsigned int rpages = 0;
 	int rc = 0;
 
-	cFYI(1, ("sync page %p",page));
+	cFYI(1, "sync page %p", page);
 	mapping = page->mapping;
 	if (!mapping)
 		return 0;
@@ -1697,7 +1723,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
 /* fill in rpages then
    result = cifs_pagein_inode(inode, index, rpages); */ /* BB finish */
 
-/* cFYI(1, ("rpages is %d for sync page of Index %ld", rpages, index));
+/* cFYI(1, "rpages is %d for sync page of Index %ld", rpages, index);
 
 #if 0
 	if (rc < 0)
@@ -1731,7 +1757,7 @@ int cifs_flush(struct file *file, fl_owner_t id)
 		CIFS_I(inode)->write_behind_rc = 0;
 	}
 
-	cFYI(1, ("Flush inode %p file %p rc %d", inode, file, rc));
+	cFYI(1, "Flush inode %p file %p rc %d", inode, file, rc);
 
 	return rc;
 }
@@ -1763,7 +1789,7 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
 	open_file = (struct cifsFileInfo *)file->private_data;
 
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
-		cFYI(1, ("attempting read on write only file instance"));
+		cFYI(1, "attempting read on write only file instance");
 
 	for (total_read = 0, current_offset = read_data;
 	     read_size > total_read;
@@ -1844,7 +1870,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 	open_file = (struct cifsFileInfo *)file->private_data;
 
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
-		cFYI(1, ("attempting read on write only file instance"));
+		cFYI(1, "attempting read on write only file instance");
 
 	for (total_read = 0, current_offset = read_data;
 	     read_size > total_read;
@@ -1890,13 +1916,12 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 
 int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
-	struct dentry *dentry = file->f_path.dentry;
 	int rc, xid;
 
 	xid = GetXid();
-	rc = cifs_revalidate(dentry);
+	rc = cifs_revalidate_file(file);
 	if (rc) {
-		cFYI(1, ("Validation prior to mmap failed, error=%d", rc));
+		cFYI(1, "Validation prior to mmap failed, error=%d", rc);
 		FreeXid(xid);
 		return rc;
 	}
@@ -1907,8 +1932,7 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
 
 
 static void cifs_copy_cache_pages(struct address_space *mapping,
-	struct list_head *pages, int bytes_read, char *data,
-	struct pagevec *plru_pvec)
+	struct list_head *pages, int bytes_read, char *data)
 {
 	struct page *page;
 	char *target;
@@ -1920,10 +1944,10 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
 		page = list_entry(pages->prev, struct page, lru);
 		list_del(&page->lru);
 
-		if (add_to_page_cache(page, mapping, page->index,
+		if (add_to_page_cache_lru(page, mapping, page->index,
 				      GFP_KERNEL)) {
 			page_cache_release(page);
-			cFYI(1, ("Add page cache failed"));
+			cFYI(1, "Add page cache failed");
 			data += PAGE_CACHE_SIZE;
 			bytes_read -= PAGE_CACHE_SIZE;
 			continue;
@@ -1946,8 +1970,6 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
 		flush_dcache_page(page);
 		SetPageUptodate(page);
 		unlock_page(page);
-		if (!pagevec_add(plru_pvec, page))
-			__pagevec_lru_add_file(plru_pvec);
 		data += PAGE_CACHE_SIZE;
 	}
 	return;
@@ -1966,7 +1988,6 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 	unsigned int read_size, i;
 	char *smb_read_data = NULL;
 	struct smb_com_read_rsp *pSMBr;
-	struct pagevec lru_pvec;
 	struct cifsFileInfo *open_file;
 	int buf_type = CIFS_NO_BUFFER;
 
@@ -1980,8 +2001,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 	pTcon = cifs_sb->tcon;
 
-	pagevec_init(&lru_pvec, 0);
-	cFYI(DBG2, ("rpages: num pages %d", num_pages));
+	cFYI(DBG2, "rpages: num pages %d", num_pages);
 	for (i = 0; i < num_pages; ) {
 		unsigned contig_pages;
 		struct page *tmp_page;
@@ -2014,8 +2034,8 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 		/* Read size needs to be in multiples of one page */
 		read_size = min_t(const unsigned int, read_size,
 				  cifs_sb->rsize & PAGE_CACHE_MASK);
-		cFYI(DBG2, ("rpages: read size 0x%x contiguous pages %d",
-				read_size, contig_pages));
+		cFYI(DBG2, "rpages: read size 0x%x contiguous pages %d",
+				read_size, contig_pages);
 		rc = -EAGAIN;
 		while (rc == -EAGAIN) {
 			if ((open_file->invalidHandle) &&
@@ -2042,14 +2062,14 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 			}
 		}
 		if ((rc < 0) || (smb_read_data == NULL)) {
-			cFYI(1, ("Read error in readpages: %d", rc));
+			cFYI(1, "Read error in readpages: %d", rc);
 			break;
 		} else if (bytes_read > 0) {
 			task_io_account_read(bytes_read);
 			pSMBr = (struct smb_com_read_rsp *)smb_read_data;
 			cifs_copy_cache_pages(mapping, page_list, bytes_read,
 				smb_read_data + 4 /* RFC1001 hdr */ +
-				le16_to_cpu(pSMBr->DataOffset), &lru_pvec);
+				le16_to_cpu(pSMBr->DataOffset));
 
 			i += bytes_read >> PAGE_CACHE_SHIFT;
 			cifs_stats_bytes_read(pTcon, bytes_read);
@@ -2065,9 +2085,9 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 			/* break; */
 		}
 	} else {
-		cFYI(1, ("No bytes read (%d) at offset %lld . "
+		cFYI(1, "No bytes read (%d) at offset %lld . "
 			 "Cleaning remaining pages from readahead list",
-			 bytes_read, offset));
+			 bytes_read, offset);
 		/* BB turn off caching and do new lookup on
 		   file size at server? */
 		break;
@@ -2082,8 +2102,6 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 		bytes_read = 0;
 	}
 
-	pagevec_lru_add_file(&lru_pvec);
-
 /* need to free smb_read_data buf before exit */
 	if (smb_read_data) {
 		if (buf_type == CIFS_SMALL_BUFFER)
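Note: the hunks above drop the open-coded LRU batching from the readpages path. Previously each page was inserted with add_to_page_cache() and collected in a local struct pagevec that was drained onto the file LRU via __pagevec_lru_add_file()/pagevec_lru_add_file(); add_to_page_cache_lru() does the page-cache insertion and the LRU accounting in a single call, so the pagevec plumbing can go away. A kernel-style sketch contrasting the two shapes (compilable only in-tree; shown just to compare the call patterns):

    #include <linux/pagemap.h>
    #include <linux/pagevec.h>

    /* before: caller batches pages and drains them to the LRU itself */
    static void cache_page_old(struct pagevec *pvec, struct page *page,
                               struct address_space *mapping)
    {
        if (!add_to_page_cache(page, mapping, page->index, GFP_KERNEL))
            if (!pagevec_add(pvec, page))        /* batch now full */
                __pagevec_lru_add_file(pvec);    /* drain to file LRU */
    }

    /* after: one helper inserts into the page cache and the LRU */
    static void cache_page_new(struct page *page,
                               struct address_space *mapping)
    {
        add_to_page_cache_lru(page, mapping, page->index, GFP_KERNEL);
    }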
@@ -2112,7 +2130,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
 	if (rc < 0)
 		goto io_error;
 	else
-		cFYI(1, ("Bytes read %d", rc));
+		cFYI(1, "Bytes read %d", rc);
 
 	file->f_path.dentry->d_inode->i_atime =
 		current_fs_time(file->f_path.dentry->d_inode->i_sb);
@@ -2144,8 +2162,8 @@ static int cifs_readpage(struct file *file, struct page *page)
 		return rc;
 	}
 
-	cFYI(1, ("readpage %p at offset %d 0x%x\n",
-		 page, (int)offset, (int)offset));
+	cFYI(1, "readpage %p at offset %d 0x%x\n",
+		 page, (int)offset, (int)offset);
 
 	rc = cifs_readpage_worker(file, page, &offset);
 
@@ -2215,7 +2233,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
 	struct page *page;
 	int rc = 0;
 
-	cFYI(1, ("write_begin from %lld len %d", (long long)pos, len));
+	cFYI(1, "write_begin from %lld len %d", (long long)pos, len);
 
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page) {
@@ -2287,12 +2305,10 @@ cifs_oplock_break(struct slow_work *work)
 	int rc, waitrc = 0;
 
 	if (inode && S_ISREG(inode->i_mode)) {
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-		if (cinode->clientCanCacheAll == 0)
+		if (cinode->clientCanCacheRead)
 			break_lease(inode, O_RDONLY);
-		else if (cinode->clientCanCacheRead == 0)
+		else
 			break_lease(inode, O_WRONLY);
-#endif
 		rc = filemap_fdatawrite(inode->i_mapping);
 		if (cinode->clientCanCacheRead == 0) {
 			waitrc = filemap_fdatawait(inode->i_mapping);
@@ -2302,7 +2318,7 @@ cifs_oplock_break(struct slow_work *work)
 			rc = waitrc;
 		if (rc)
 			cinode->write_behind_rc = rc;
-		cFYI(1, ("Oplock flush inode %p rc %d", inode, rc));
+		cFYI(1, "Oplock flush inode %p rc %d", inode, rc);
 	}
 
 	/*
@@ -2314,7 +2330,7 @@ cifs_oplock_break(struct slow_work *work)
 	if (!cfile->closePend && !cfile->oplock_break_cancelled) {
 		rc = CIFSSMBLock(0, cifs_sb->tcon, cfile->netfid, 0, 0, 0, 0,
 				 LOCKING_ANDX_OPLOCK_RELEASE, false);
-		cFYI(1, ("Oplock release rc = %d", rc));
+		cFYI(1, "Oplock release rc = %d", rc);
 	}
 }
 
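Note: the oplock-break path above now calls break_lease() unconditionally (the CONFIG_CIFS_EXPERIMENTAL guard is gone): if the client may still cache reads it breaks conflicting leases as a read opener would (O_RDONLY), otherwise as a writer (O_WRONLY). What gets broken are the leases processes take with fcntl(); a user-space view of the other end of that mechanism (error handling trimmed, signal delivery on break not shown):

    #include <fcntl.h>
    #include <stdio.h>

    /* a process holding an F_RDLCK lease on fd is notified (SIGIO by
       default) when the kernel calls break_lease() against the inode,
       e.g. from an oplock break like the one above */
    int take_read_lease(int fd)
    {
        if (fcntl(fd, F_SETLEASE, F_RDLCK) == -1) {
            perror("F_SETLEASE");
            return -1;
        }
        return 0;
    }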
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 8bdbc818164c..62b324f26a56 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1,7 +1,7 @@
 /*
  * fs/cifs/inode.c
  *
- * Copyright (C) International Business Machines Corp., 2002,2008
+ * Copyright (C) International Business Machines Corp., 2002,2010
  * Author(s): Steve French (sfrench@us.ibm.com)
  *
  * This library is free software; you can redistribute it and/or modify
@@ -20,6 +20,7 @@
  */
 #include <linux/fs.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <asm/div64.h>
 #include "cifsfs.h"
@@ -77,6 +78,41 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
 	}
 }
 
+/* check inode attributes against fattr. If they don't match, tag the
+ * inode for cache invalidation
+ */
+static void
+cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
+{
+	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+
+	cFYI(1, "%s: revalidating inode %llu", __func__, cifs_i->uniqueid);
+
+	if (inode->i_state & I_NEW) {
+		cFYI(1, "%s: inode %llu is new", __func__, cifs_i->uniqueid);
+		return;
+	}
+
+	/* don't bother with revalidation if we have an oplock */
+	if (cifs_i->clientCanCacheRead) {
+		cFYI(1, "%s: inode %llu is oplocked", __func__,
+			 cifs_i->uniqueid);
+		return;
+	}
+
+	/* revalidate if mtime or size have changed */
+	if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) &&
+	    cifs_i->server_eof == fattr->cf_eof) {
+		cFYI(1, "%s: inode %llu is unchanged", __func__,
+			 cifs_i->uniqueid);
+		return;
+	}
+
+	cFYI(1, "%s: invalidating inode %llu mapping", __func__,
+		 cifs_i->uniqueid);
+	cifs_i->invalid_mapping = true;
+}
+
 /* populate an inode with info from a cifs_fattr struct */
 void
 cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
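Note: cifs_revalidate_cache() centralizes the staleness test that used to be open-coded at each attribute refresh: a brand-new inode or one protected by a read oplock is trusted as-is; otherwise a change in either mtime or end-of-file tags the inode with invalid_mapping so the cached pages can be zapped later. A rough user-space analogue of the test (the struct and names are illustrative, not the kernel's):

    #include <stdbool.h>
    #include <stdint.h>
    #include <time.h>

    struct cached_attrs {
        struct timespec mtime;   /* last mtime seen from the server */
        uint64_t eof;            /* last known end-of-file */
        bool oplocked;           /* server promised change notification */
        bool invalid_mapping;    /* cached pages need to be dropped */
    };

    static bool ts_equal(const struct timespec *a, const struct timespec *b)
    {
        return a->tv_sec == b->tv_sec && a->tv_nsec == b->tv_nsec;
    }

    static void revalidate_cache(struct cached_attrs *c,
                                 const struct timespec *srv_mtime,
                                 uint64_t srv_eof)
    {
        if (c->oplocked)
            return;                      /* oplock: cache stays valid */
        if (ts_equal(&c->mtime, srv_mtime) && c->eof == srv_eof)
            return;                      /* unchanged on the server */
        c->invalid_mapping = true;       /* zap the mapping later */
    }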
@@ -85,6 +121,8 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	unsigned long oldtime = cifs_i->time;
 
+	cifs_revalidate_cache(inode, fattr);
+
 	inode->i_atime = fattr->cf_atime;
 	inode->i_mtime = fattr->cf_mtime;
 	inode->i_ctime = fattr->cf_ctime;
@@ -99,15 +137,14 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 	inode->i_mode = fattr->cf_mode;
 
 	cifs_i->cifsAttrs = fattr->cf_cifsattrs;
-	cifs_i->uniqueid = fattr->cf_uniqueid;
 
 	if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL)
 		cifs_i->time = 0;
 	else
 		cifs_i->time = jiffies;
 
-	cFYI(1, ("inode 0x%p old_time=%ld new_time=%ld", inode,
-		 oldtime, cifs_i->time));
+	cFYI(1, "inode 0x%p old_time=%ld new_time=%ld", inode,
+		 oldtime, cifs_i->time);
 
 	cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING;
 
@@ -132,6 +169,17 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 	cifs_set_ops(inode, fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL);
 }
 
+void
+cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
+		return;
+
+	fattr->cf_uniqueid = iunique(sb, ROOT_I);
+}
+
 /* Fill a cifs_fattr struct with info from FILE_UNIX_BASIC_INFO. */
 void
 cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
@@ -189,7 +237,7 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
 		/* safest to call it a file if we do not know */
 		fattr->cf_mode |= S_IFREG;
 		fattr->cf_dtype = DT_REG;
-		cFYI(1, ("unknown type %d", le32_to_cpu(info->Type)));
+		cFYI(1, "unknown type %d", le32_to_cpu(info->Type));
 		break;
 	}
 
@@ -218,7 +266,7 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
 {
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 
-	cFYI(1, ("creating fake fattr for DFS referral"));
+	cFYI(1, "creating fake fattr for DFS referral");
 
 	memset(fattr, 0, sizeof(*fattr));
 	fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU;
@@ -231,6 +279,31 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
 	fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL;
 }
 
+int cifs_get_file_info_unix(struct file *filp)
+{
+	int rc;
+	int xid;
+	FILE_UNIX_BASIC_INFO find_data;
+	struct cifs_fattr fattr;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
+
+	xid = GetXid();
+	rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data);
+	if (!rc) {
+		cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
+	} else if (rc == -EREMOTE) {
+		cifs_create_dfs_fattr(&fattr, inode->i_sb);
+		rc = 0;
+	}
+
+	cifs_fattr_to_inode(inode, &fattr);
+	FreeXid(xid);
+	return rc;
+}
+
 int cifs_get_inode_info_unix(struct inode **pinode,
 			     const unsigned char *full_path,
 			     struct super_block *sb, int xid)
@@ -242,7 +315,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 
 	tcon = cifs_sb->tcon;
-	cFYI(1, ("Getting info on %s", full_path));
+	cFYI(1, "Getting info on %s", full_path);
 
 	/* could have done a find first instead but this returns more info */
 	rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data,
@@ -260,6 +333,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 
 	if (*pinode == NULL) {
 		/* get new inode */
+		cifs_fill_uniqueid(sb, &fattr);
 		*pinode = cifs_iget(sb, &fattr);
 		if (!*pinode)
 			rc = -ENOMEM;
@@ -310,7 +384,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 			 &bytes_read, &pbuf, &buf_type);
 	if ((rc == 0) && (bytes_read >= 8)) {
 		if (memcmp("IntxBLK", pbuf, 8) == 0) {
-			cFYI(1, ("Block device"));
+			cFYI(1, "Block device");
 			fattr->cf_mode |= S_IFBLK;
 			fattr->cf_dtype = DT_BLK;
 			if (bytes_read == 24) {
@@ -322,7 +396,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 				fattr->cf_rdev = MKDEV(mjr, mnr);
 			}
 		} else if (memcmp("IntxCHR", pbuf, 8) == 0) {
-			cFYI(1, ("Char device"));
+			cFYI(1, "Char device");
 			fattr->cf_mode |= S_IFCHR;
 			fattr->cf_dtype = DT_CHR;
 			if (bytes_read == 24) {
@@ -334,7 +408,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 				fattr->cf_rdev = MKDEV(mjr, mnr);
 			}
 		} else if (memcmp("IntxLNK", pbuf, 7) == 0) {
-			cFYI(1, ("Symlink"));
+			cFYI(1, "Symlink");
 			fattr->cf_mode |= S_IFLNK;
 			fattr->cf_dtype = DT_LNK;
 		} else {
@@ -376,10 +450,10 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
 	else if (rc > 3) {
 		mode = le32_to_cpu(*((__le32 *)ea_value));
 		fattr->cf_mode &= ~SFBITS_MASK;
-		cFYI(1, ("special bits 0%o org mode 0%o", mode,
-			 fattr->cf_mode));
+		cFYI(1, "special bits 0%o org mode 0%o", mode,
+			 fattr->cf_mode);
 		fattr->cf_mode = (mode & SFBITS_MASK) | fattr->cf_mode;
-		cFYI(1, ("special mode bits 0%o", mode));
+		cFYI(1, "special mode bits 0%o", mode);
 	}
 
 	return 0;
@@ -432,6 +506,47 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 	fattr->cf_gid = cifs_sb->mnt_gid;
 }
 
+int cifs_get_file_info(struct file *filp)
+{
+	int rc;
+	int xid;
+	FILE_ALL_INFO find_data;
+	struct cifs_fattr fattr;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
+
+	xid = GetXid();
+	rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
+	if (rc == -EOPNOTSUPP || rc == -EINVAL) {
+		/*
+		 * FIXME: legacy server -- fall back to path-based call?
+		 * for now, just skip revalidating and mark inode for
+		 * immediate reval.
+		 */
+		rc = 0;
+		CIFS_I(inode)->time = 0;
+		goto cgfi_exit;
+	} else if (rc == -EREMOTE) {
+		cifs_create_dfs_fattr(&fattr, inode->i_sb);
+		rc = 0;
+	} else if (rc)
+		goto cgfi_exit;
+
+	/*
+	 * don't bother with SFU junk here -- just mark inode as needing
+	 * revalidation.
+	 */
+	cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false);
+	fattr.cf_uniqueid = CIFS_I(inode)->uniqueid;
+	fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
+	cifs_fattr_to_inode(inode, &fattr);
+cgfi_exit:
+	FreeXid(xid);
+	return rc;
+}
+
 int cifs_get_inode_info(struct inode **pinode,
 	const unsigned char *full_path, FILE_ALL_INFO *pfindData,
 	struct super_block *sb, int xid, const __u16 *pfid)
@@ -444,11 +559,11 @@ int cifs_get_inode_info(struct inode **pinode,
 	struct cifs_fattr fattr;
 
 	pTcon = cifs_sb->tcon;
-	cFYI(1, ("Getting info on %s", full_path));
+	cFYI(1, "Getting info on %s", full_path);
 
 	if ((pfindData == NULL) && (*pinode != NULL)) {
 		if (CIFS_I(*pinode)->clientCanCacheRead) {
-			cFYI(1, ("No need to revalidate cached inode sizes"));
+			cFYI(1, "No need to revalidate cached inode sizes");
 			return rc;
 		}
 	}
@@ -514,7 +629,7 @@ int cifs_get_inode_info(struct inode **pinode,
 					cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
 		if (rc1 || !fattr.cf_uniqueid) {
-			cFYI(1, ("GetSrvInodeNum rc %d", rc1));
+			cFYI(1, "GetSrvInodeNum rc %d", rc1);
 			fattr.cf_uniqueid = iunique(sb, ROOT_I);
 			cifs_autodisable_serverino(cifs_sb);
 		}
@@ -530,13 +645,13 @@ int cifs_get_inode_info(struct inode **pinode,
 	    cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
 		tmprc = cifs_sfu_type(&fattr, full_path, cifs_sb, xid);
 		if (tmprc)
-			cFYI(1, ("cifs_sfu_type failed: %d", tmprc));
+			cFYI(1, "cifs_sfu_type failed: %d", tmprc);
 	}
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	/* fill in 0777 bits from ACL */
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
-		cFYI(1, ("Getting mode bits from ACL"));
+		cFYI(1, "Getting mode bits from ACL");
 		cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, pfid);
 	}
 #endif
@@ -611,6 +726,16 @@ cifs_find_inode(struct inode *inode, void *opaque)
 	if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
 		return 0;
 
+	/*
+	 * uh oh -- it's a directory. We can't use it since hardlinked dirs are
+	 * verboten. Disable serverino and return it as if it were found, the
+	 * caller can discard it, generate a uniqueid and retry the find
+	 */
+	if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) {
+		fattr->cf_flags |= CIFS_FATTR_INO_COLLISION;
+		cifs_autodisable_serverino(CIFS_SB(inode->i_sb));
+	}
+
 	return 1;
 }
 
@@ -630,15 +755,22 @@ cifs_iget(struct super_block *sb, struct cifs_fattr *fattr)
 	unsigned long hash;
 	struct inode *inode;
 
-	cFYI(1, ("looking for uniqueid=%llu", fattr->cf_uniqueid));
+retry_iget5_locked:
+	cFYI(1, "looking for uniqueid=%llu", fattr->cf_uniqueid);
 
 	/* hash down to 32-bits on 32-bit arch */
 	hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid);
 
 	inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr);
-
-	/* we have fattrs in hand, update the inode */
 	if (inode) {
+		/* was there a problematic inode number collision? */
+		if (fattr->cf_flags & CIFS_FATTR_INO_COLLISION) {
+			iput(inode);
+			fattr->cf_uniqueid = iunique(sb, ROOT_I);
+			fattr->cf_flags &= ~CIFS_FATTR_INO_COLLISION;
+			goto retry_iget5_locked;
+		}
+
 		cifs_fattr_to_inode(inode, fattr);
 		if (sb->s_flags & MS_NOATIME)
 			inode->i_flags |= S_NOATIME | S_NOCMTIME;
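Note: the pair of hunks above handles servers that hand out colliding inode numbers for directories. cifs_find_inode() refuses to share an in-use directory inode (hardlinked directories would corrupt the dcache), tags the fattr with CIFS_FATTR_INO_COLLISION and disables serverino; cifs_iget() then drops the colliding match, substitutes a locally generated number, and retries the lookup. A stand-alone sketch of that retry shape, where lookup_or_alloc() and fresh_id() are hypothetical stand-ins for iget5_locked() and iunique():

    #include <stdbool.h>
    #include <stdio.h>

    struct fattr {
        unsigned long long uniqueid;
        bool collision;          /* set by the find callback */
    };

    static int fake_inode;       /* placeholder "inode" */
    static unsigned long long next_local_id = 1000;

    static void *lookup_or_alloc(struct fattr *f)
    {
        /* simulate one colliding directory match, then success */
        f->collision = (f->uniqueid < 1000);
        return &fake_inode;
    }

    static unsigned long long fresh_id(void) { return next_local_id++; }

    static void *get_inode(struct fattr *f)
    {
        void *inode;
    retry:
        inode = lookup_or_alloc(f);
        if (inode && f->collision) {
            f->collision = false;
            f->uniqueid = fresh_id();   /* fall back to a local id */
            goto retry;                 /* redo the lookup */
        }
        return inode;
    }

    int main(void)
    {
        struct fattr f = { .uniqueid = 42, .collision = false };
        void *ino = get_inode(&f);
        printf("got inode %p with uniqueid %llu\n", ino, f.uniqueid);
        return 0;
    }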
@@ -676,7 +808,7 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
 		return ERR_PTR(-ENOMEM);
 
 	if (rc && cifs_sb->tcon->ipc) {
-		cFYI(1, ("ipc connection - fake read inode"));
+		cFYI(1, "ipc connection - fake read inode");
 		inode->i_mode |= S_IFDIR;
 		inode->i_nlink = 2;
 		inode->i_op = &cifs_ipc_inode_ops;
@@ -738,7 +870,7 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
 	 * server times.
 	 */
 	if (set_time && (attrs->ia_valid & ATTR_CTIME)) {
-		cFYI(1, ("CIFS - CTIME changed"));
+		cFYI(1, "CIFS - CTIME changed");
 		info_buf.ChangeTime =
 		    cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime));
 	} else
@@ -773,8 +905,8 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
 		goto out;
 	}
 
-	cFYI(1, ("calling SetFileInfo since SetPathInfo for "
-		 "times not supported by this server"));
+	cFYI(1, "calling SetFileInfo since SetPathInfo for "
+		 "times not supported by this server");
 	rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
 			 SYNCHRONIZE | FILE_WRITE_ATTRIBUTES,
 			 CREATE_NOT_DIR, &netfid, &oplock,
@@ -932,7 +1064,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
 	struct iattr *attrs = NULL;
 	__u32 dosattr = 0, origattr = 0;
 
-	cFYI(1, ("cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry));
+	cFYI(1, "cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry);
 
 	xid = GetXid();
 
@@ -951,7 +1083,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
 		rc = CIFSPOSIXDelFile(xid, tcon, full_path,
 			SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
-		cFYI(1, ("posix del rc %d", rc));
+		cFYI(1, "posix del rc %d", rc);
 		if ((rc == 0) || (rc == -ENOENT))
 			goto psx_del_no_retry;
 	}
@@ -1025,7 +1157,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 	struct inode *newinode = NULL;
 	struct cifs_fattr fattr;
 
-	cFYI(1, ("In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode));
+	cFYI(1, "In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode);
 
 	xid = GetXid();
 
@@ -1060,7 +1192,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 			kfree(pInfo);
 			goto mkdir_retry_old;
 		} else if (rc) {
-			cFYI(1, ("posix mkdir returned 0x%x", rc));
+			cFYI(1, "posix mkdir returned 0x%x", rc);
 			d_drop(direntry);
 		} else {
 			if (pInfo->Type == cpu_to_le32(-1)) {
@@ -1077,6 +1209,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 				direntry->d_op = &cifs_dentry_ops;
 
 			cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb);
+			cifs_fill_uniqueid(inode->i_sb, &fattr);
 			newinode = cifs_iget(inode->i_sb, &fattr);
 			if (!newinode) {
 				kfree(pInfo);
@@ -1086,12 +1219,12 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 			d_instantiate(direntry, newinode);
 
 #ifdef CONFIG_CIFS_DEBUG2
-			cFYI(1, ("instantiated dentry %p %s to inode %p",
-				direntry, direntry->d_name.name, newinode));
+			cFYI(1, "instantiated dentry %p %s to inode %p",
+				direntry, direntry->d_name.name, newinode);
 
 			if (newinode->i_nlink != 2)
-				cFYI(1, ("unexpected number of links %d",
-					newinode->i_nlink));
+				cFYI(1, "unexpected number of links %d",
+					newinode->i_nlink);
 #endif
 		}
 		kfree(pInfo);
@@ -1102,7 +1235,7 @@ mkdir_retry_old:
 	rc = CIFSSMBMkDir(xid, pTcon, full_path, cifs_sb->local_nls,
 			  cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc) {
-		cFYI(1, ("cifs_mkdir returned 0x%x", rc));
+		cFYI(1, "cifs_mkdir returned 0x%x", rc);
 		d_drop(direntry);
 	} else {
 mkdir_get_info:
@@ -1205,7 +1338,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 	char *full_path = NULL;
 	struct cifsInodeInfo *cifsInode;
 
-	cFYI(1, ("cifs_rmdir, inode = 0x%p", inode));
+	cFYI(1, "cifs_rmdir, inode = 0x%p", inode);
 
 	xid = GetXid();
 
@@ -1389,135 +1522,108 @@ cifs_rename_exit:
 	return rc;
 }
 
-int cifs_revalidate(struct dentry *direntry)
+static bool
+cifs_inode_needs_reval(struct inode *inode)
 {
-	int xid;
-	int rc = 0, wbrc = 0;
-	char *full_path;
-	struct cifs_sb_info *cifs_sb;
-	struct cifsInodeInfo *cifsInode;
-	loff_t local_size;
-	struct timespec local_mtime;
-	bool invalidate_inode = false;
+	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
 
-	if (direntry->d_inode == NULL)
-		return -ENOENT;
+	if (cifs_i->clientCanCacheRead)
+		return false;
 
-	cifsInode = CIFS_I(direntry->d_inode);
+	if (!lookupCacheEnabled)
+		return true;
 
-	if (cifsInode == NULL)
-		return -ENOENT;
+	if (cifs_i->time == 0)
+		return true;
 
-	/* no sense revalidating inode info on file that no one can write */
-	if (CIFS_I(direntry->d_inode)->clientCanCacheRead)
-		return rc;
+	/* FIXME: the actimeo should be tunable */
+	if (time_after_eq(jiffies, cifs_i->time + HZ))
+		return true;
+
+	/* hardlinked files w/ noserverino get "special" treatment */
+	if (!(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) &&
+	    S_ISREG(inode->i_mode) && inode->i_nlink != 1)
+		return true;
+
+	return false;
+}
+
+/* check invalid_mapping flag and zap the cache if it's set */
+static void
+cifs_invalidate_mapping(struct inode *inode)
+{
+	int rc;
+	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+
+	cifs_i->invalid_mapping = false;
+
+	/* write back any cached data */
+	if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
+		rc = filemap_write_and_wait(inode->i_mapping);
+		if (rc)
+			cifs_i->write_behind_rc = rc;
+	}
+	invalidate_remote_inode(inode);
+}
+
+int cifs_revalidate_file(struct file *filp)
+{
+	int rc = 0;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+
+	if (!cifs_inode_needs_reval(inode))
+		goto check_inval;
+
+	if (CIFS_SB(inode->i_sb)->tcon->unix_ext)
+		rc = cifs_get_file_info_unix(filp);
+	else
+		rc = cifs_get_file_info(filp);
+
+check_inval:
+	if (CIFS_I(inode)->invalid_mapping)
+		cifs_invalidate_mapping(inode);
+
+	return rc;
+}
+
+/* revalidate a dentry's inode attributes */
+int cifs_revalidate_dentry(struct dentry *dentry)
+{
+	int xid;
+	int rc = 0;
+	char *full_path = NULL;
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = dentry->d_sb;
+
+	if (inode == NULL)
+		return -ENOENT;
 
 	xid = GetXid();
 
-	cifs_sb = CIFS_SB(direntry->d_sb);
+	if (!cifs_inode_needs_reval(inode))
+		goto check_inval;
 
 	/* can not safely grab the rename sem here if rename calls revalidate
 	   since that would deadlock */
-	full_path = build_path_from_dentry(direntry);
+	full_path = build_path_from_dentry(dentry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
-	}
-	cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
-		 "jiffies %ld", full_path, direntry->d_inode,
-		 direntry->d_inode->i_count.counter, direntry,
-		 direntry->d_time, jiffies));
-
-	if (cifsInode->time == 0) {
-		/* was set to zero previously to force revalidate */
-	} else if (time_before(jiffies, cifsInode->time + HZ) &&
-		   lookupCacheEnabled) {
-		if ((S_ISREG(direntry->d_inode->i_mode) == 0) ||
-		    (direntry->d_inode->i_nlink == 1)) {
-			kfree(full_path);
-			FreeXid(xid);
-			return rc;
-		} else {
-			cFYI(1, ("Have to revalidate file due to hardlinks"));
-		}
+		goto check_inval;
 	}
 
-	/* save mtime and size */
-	local_mtime = direntry->d_inode->i_mtime;
-	local_size = direntry->d_inode->i_size;
+	cFYI(1, "Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
+		 "jiffies %ld", full_path, inode, inode->i_count.counter,
+		 dentry, dentry->d_time, jiffies);
 
-	if (cifs_sb->tcon->unix_ext) {
-		rc = cifs_get_inode_info_unix(&direntry->d_inode, full_path,
-					      direntry->d_sb, xid);
-		if (rc) {
-			cFYI(1, ("error on getting revalidate info %d", rc));
+	if (CIFS_SB(sb)->tcon->unix_ext)
+		rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
+	else
+		rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
+					 xid, NULL);
-/*			if (rc != -ENOENT)
-				rc = 0; */	/* BB should we cache info on
-						   certain errors? */
-		}
-	} else {
-		rc = cifs_get_inode_info(&direntry->d_inode, full_path, NULL,
-					 direntry->d_sb, xid, NULL);
-		if (rc) {
-			cFYI(1, ("error on getting revalidate info %d", rc));
-/*			if (rc != -ENOENT)
-				rc = 0; */	/* BB should we cache info on
-						   certain errors? */
-		}
-	}
-	/* should we remap certain errors, access denied?, to zero */
-
-	/* if not oplocked, we invalidate inode pages if mtime or file size
-	   had changed on server */
-
-	if (timespec_equal(&local_mtime, &direntry->d_inode->i_mtime) &&
-	    (local_size == direntry->d_inode->i_size)) {
-		cFYI(1, ("cifs_revalidate - inode unchanged"));
-	} else {
-		/* file may have changed on server */
-		if (cifsInode->clientCanCacheRead) {
-			/* no need to invalidate inode pages since we were the
1481 only ones who could have modified the file and the
1482 server copy is staler than ours */
1483 } else {
1484 invalidate_inode = true;
1485 }
1486 }
1487 1623
1488 /* can not grab this sem since kernel filesys locking documentation 1624check_inval:
1489 indicates i_mutex may be taken by the kernel on lookup and rename 1625 if (CIFS_I(inode)->invalid_mapping)
1490 which could deadlock if we grab the i_mutex here as well */ 1626 cifs_invalidate_mapping(inode);
1491/* mutex_lock(&direntry->d_inode->i_mutex);*/
1492 /* need to write out dirty pages here */
1493 if (direntry->d_inode->i_mapping) {
1494 /* do we need to lock inode until after invalidate completes
1495 below? */
1496 wbrc = filemap_fdatawrite(direntry->d_inode->i_mapping);
1497 if (wbrc)
1498 CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
1499 }
1500 if (invalidate_inode) {
1501 /* shrink_dcache not necessary now that cifs dentry ops
1502 are exported for negative dentries */
1503/* if (S_ISDIR(direntry->d_inode->i_mode))
1504 shrink_dcache_parent(direntry); */
1505 if (S_ISREG(direntry->d_inode->i_mode)) {
1506 if (direntry->d_inode->i_mapping) {
1507 wbrc = filemap_fdatawait(direntry->d_inode->i_mapping);
1508 if (wbrc)
1509 CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
1510 }
1511 /* may eventually have to do this for open files too */
1512 if (list_empty(&(cifsInode->openFileList))) {
1513 /* changed on server - flush read ahead pages */
1514 cFYI(1, ("Invalidating read ahead data on "
1515 "closed file"));
1516 invalidate_remote_inode(direntry->d_inode);
1517 }
1518 }
1519 }
1520/* mutex_unlock(&direntry->d_inode->i_mutex); */
1521 1627
1522 kfree(full_path); 1628 kfree(full_path);
1523 FreeXid(xid); 1629 FreeXid(xid);
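This hunk replaces the open-coded cifs_revalidate() with three pieces: cifs_inode_needs_reval() decides whether cached attributes can still be trusted, cifs_invalidate_mapping() writes back and drops cached pages when the invalid_mapping flag is set, and cifs_revalidate_file()/cifs_revalidate_dentry() drive the two for file- and dentry-based callers. The staleness test is a plain jiffies timeout of HZ (one second; the FIXME notes it should become a tunable actimeo). A minimal stand-alone sketch of that expiry idiom, with illustrative names:

#include <linux/types.h>
#include <linux/jiffies.h>

struct attr_cache {
	unsigned long refreshed;	/* jiffies at last fetch; 0 = force reval */
};

static bool attr_cache_expired(const struct attr_cache *c)
{
	if (c->refreshed == 0)
		return true;
	/* time_after_eq() copes with jiffies wraparound */
	return time_after_eq(jiffies, c->refreshed + HZ);
}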
@@ -1527,7 +1633,7 @@ int cifs_revalidate(struct dentry *direntry)
1527int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, 1633int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1528 struct kstat *stat) 1634 struct kstat *stat)
1529{ 1635{
1530 int err = cifs_revalidate(dentry); 1636 int err = cifs_revalidate_dentry(dentry);
1531 if (!err) { 1637 if (!err) {
1532 generic_fillattr(dentry->d_inode, stat); 1638 generic_fillattr(dentry->d_inode, stat);
1533 stat->blksize = CIFS_MAX_MSGSIZE; 1639 stat->blksize = CIFS_MAX_MSGSIZE;
@@ -1601,12 +1707,12 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1601 rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid, 1707 rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid,
1602 npid, false); 1708 npid, false);
1603 cifsFileInfo_put(open_file); 1709 cifsFileInfo_put(open_file);
1604 cFYI(1, ("SetFSize for attrs rc = %d", rc)); 1710 cFYI(1, "SetFSize for attrs rc = %d", rc);
1605 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { 1711 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
1606 unsigned int bytes_written; 1712 unsigned int bytes_written;
1607 rc = CIFSSMBWrite(xid, pTcon, nfid, 0, attrs->ia_size, 1713 rc = CIFSSMBWrite(xid, pTcon, nfid, 0, attrs->ia_size,
1608 &bytes_written, NULL, NULL, 1); 1714 &bytes_written, NULL, NULL, 1);
1609 cFYI(1, ("Wrt seteof rc %d", rc)); 1715 cFYI(1, "Wrt seteof rc %d", rc);
1610 } 1716 }
1611 } else 1717 } else
1612 rc = -EINVAL; 1718 rc = -EINVAL;
@@ -1620,7 +1726,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1620 false, cifs_sb->local_nls, 1726 false, cifs_sb->local_nls,
1621 cifs_sb->mnt_cifs_flags & 1727 cifs_sb->mnt_cifs_flags &
1622 CIFS_MOUNT_MAP_SPECIAL_CHR); 1728 CIFS_MOUNT_MAP_SPECIAL_CHR);
1623 cFYI(1, ("SetEOF by path (setattrs) rc = %d", rc)); 1729 cFYI(1, "SetEOF by path (setattrs) rc = %d", rc);
1624 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { 1730 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
1625 __u16 netfid; 1731 __u16 netfid;
1626 int oplock = 0; 1732 int oplock = 0;
@@ -1637,7 +1743,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1637 attrs->ia_size, 1743 attrs->ia_size,
1638 &bytes_written, NULL, 1744 &bytes_written, NULL,
1639 NULL, 1); 1745 NULL, 1);
1640 cFYI(1, ("wrt seteof rc %d", rc)); 1746 cFYI(1, "wrt seteof rc %d", rc);
1641 CIFSSMBClose(xid, pTcon, netfid); 1747 CIFSSMBClose(xid, pTcon, netfid);
1642 } 1748 }
1643 } 1749 }
@@ -1665,8 +1771,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1665 struct cifs_unix_set_info_args *args = NULL; 1771 struct cifs_unix_set_info_args *args = NULL;
1666 struct cifsFileInfo *open_file; 1772 struct cifsFileInfo *open_file;
1667 1773
1668 cFYI(1, ("setattr_unix on file %s attrs->ia_valid=0x%x", 1774 cFYI(1, "setattr_unix on file %s attrs->ia_valid=0x%x",
1669 direntry->d_name.name, attrs->ia_valid)); 1775 direntry->d_name.name, attrs->ia_valid);
1670 1776
1671 xid = GetXid(); 1777 xid = GetXid();
1672 1778
@@ -1796,8 +1902,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
1796 1902
1797 xid = GetXid(); 1903 xid = GetXid();
1798 1904
 1799 cFYI(1, ("setattr on file %s attrs->ia_valid 0x%x", 1905 cFYI(1, "setattr on file %s attrs->ia_valid 0x%x",
1800 direntry->d_name.name, attrs->ia_valid)); 1906 direntry->d_name.name, attrs->ia_valid);
1801 1907
1802 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) { 1908 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) {
1803 /* check if we have permission to change attrs */ 1909 /* check if we have permission to change attrs */
@@ -1854,7 +1960,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
1854 attrs->ia_valid &= ~ATTR_MODE; 1960 attrs->ia_valid &= ~ATTR_MODE;
1855 1961
1856 if (attrs->ia_valid & ATTR_MODE) { 1962 if (attrs->ia_valid & ATTR_MODE) {
1857 cFYI(1, ("Mode changed to 0%o", attrs->ia_mode)); 1963 cFYI(1, "Mode changed to 0%o", attrs->ia_mode);
1858 mode = attrs->ia_mode; 1964 mode = attrs->ia_mode;
1859 } 1965 }
1860 1966
@@ -1940,7 +2046,7 @@ cifs_setattr(struct dentry *direntry, struct iattr *attrs)
1940#if 0 2046#if 0
1941void cifs_delete_inode(struct inode *inode) 2047void cifs_delete_inode(struct inode *inode)
1942{ 2048{
1943 cFYI(1, ("In cifs_delete_inode, inode = 0x%p", inode)); 2049 cFYI(1, "In cifs_delete_inode, inode = 0x%p", inode);
1944 /* may have to add back in if and when safe distributed caching of 2050 /* may have to add back in if and when safe distributed caching of
1945 directories added e.g. via FindNotify */ 2051 directories added e.g. via FindNotify */
1946} 2052}
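Most hunks in this patch are one mechanical conversion: cFYI(1, ("fmt", args)) becomes cFYI(1, "fmt", args), and cERROR likewise loses its inner parentheses. The old style wrapped the whole printk argument list in parentheses so a single-argument macro could forward it verbatim; the new style uses a variadic macro instead, which also lets the macro append the log level and trailing newline itself. The real definitions live in fs/cifs/cifs_debug.h; the pair below is a simplified stand-in to show why every call site changes shape:

#include <linux/kernel.h>

/* old style: one macro argument, caller supplies the extra parentheses */
#define cFYI_OLD(set, args)						\
do {									\
	if (set)							\
		printk args;						\
} while (0)

/* new style: variadic, call sites drop the inner parentheses */
#define cFYI_NEW(set, fmt, ...)						\
do {									\
	if (set)							\
		printk(KERN_DEBUG fmt "\n", ##__VA_ARGS__);		\
} while (0)

/* cFYI_OLD(1, (KERN_DEBUG "rc %d\n", rc));  vs.  cFYI_NEW(1, "rc %d", rc); */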
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index f94650683a00..505926f1ee6b 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -47,7 +47,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
47 47
48 xid = GetXid(); 48 xid = GetXid();
49 49
50 cFYI(1, ("ioctl file %p cmd %u arg %lu", filep, command, arg)); 50 cFYI(1, "ioctl file %p cmd %u arg %lu", filep, command, arg);
51 51
52 cifs_sb = CIFS_SB(inode->i_sb); 52 cifs_sb = CIFS_SB(inode->i_sb);
53 53
@@ -64,12 +64,12 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
64 64
65 switch (command) { 65 switch (command) {
66 case CIFS_IOC_CHECKUMOUNT: 66 case CIFS_IOC_CHECKUMOUNT:
67 cFYI(1, ("User unmount attempted")); 67 cFYI(1, "User unmount attempted");
68 if (cifs_sb->mnt_uid == current_uid()) 68 if (cifs_sb->mnt_uid == current_uid())
69 rc = 0; 69 rc = 0;
70 else { 70 else {
71 rc = -EACCES; 71 rc = -EACCES;
72 cFYI(1, ("uids do not match")); 72 cFYI(1, "uids do not match");
73 } 73 }
74 break; 74 break;
75#ifdef CONFIG_CIFS_POSIX 75#ifdef CONFIG_CIFS_POSIX
@@ -97,11 +97,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
97 /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid, 97 /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid,
98 extAttrBits, &ExtAttrMask);*/ 98 extAttrBits, &ExtAttrMask);*/
99 } 99 }
100 cFYI(1, ("set flags not implemented yet")); 100 cFYI(1, "set flags not implemented yet");
101 break; 101 break;
102#endif /* CONFIG_CIFS_POSIX */ 102#endif /* CONFIG_CIFS_POSIX */
103 default: 103 default:
104 cFYI(1, ("unsupported ioctl")); 104 cFYI(1, "unsupported ioctl");
105 break; 105 break;
106 } 106 }
107 107
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index fc1e0487eaee..473ca8033656 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -20,6 +20,7 @@
20 */ 20 */
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/stat.h> 22#include <linux/stat.h>
23#include <linux/slab.h>
23#include <linux/namei.h> 24#include <linux/namei.h>
24#include "cifsfs.h" 25#include "cifsfs.h"
25#include "cifspdu.h" 26#include "cifspdu.h"
@@ -138,7 +139,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
138 if (!full_path) 139 if (!full_path)
139 goto out; 140 goto out;
140 141
141 cFYI(1, ("Full path: %s inode = 0x%p", full_path, inode)); 142 cFYI(1, "Full path: %s inode = 0x%p", full_path, inode);
142 143
143 rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path, 144 rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path,
144 cifs_sb->local_nls); 145 cifs_sb->local_nls);
@@ -177,8 +178,8 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
177 return rc; 178 return rc;
178 } 179 }
179 180
180 cFYI(1, ("Full path: %s", full_path)); 181 cFYI(1, "Full path: %s", full_path);
181 cFYI(1, ("symname is %s", symname)); 182 cFYI(1, "symname is %s", symname);
182 183
183 /* BB what if DFS and this volume is on different share? BB */ 184 /* BB what if DFS and this volume is on different share? BB */
184 if (pTcon->unix_ext) 185 if (pTcon->unix_ext)
@@ -197,8 +198,8 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
197 inode->i_sb, xid, NULL); 198 inode->i_sb, xid, NULL);
198 199
199 if (rc != 0) { 200 if (rc != 0) {
200 cFYI(1, ("Create symlink ok, getinodeinfo fail rc = %d", 201 cFYI(1, "Create symlink ok, getinodeinfo fail rc = %d",
201 rc)); 202 rc);
202 } else { 203 } else {
203 if (pTcon->nocase) 204 if (pTcon->nocase)
204 direntry->d_op = &cifs_ci_dentry_ops; 205 direntry->d_op = &cifs_ci_dentry_ops;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index d1474996a812..1394aa37f26c 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -51,7 +51,7 @@ _GetXid(void)
51 if (GlobalTotalActiveXid > GlobalMaxActiveXid) 51 if (GlobalTotalActiveXid > GlobalMaxActiveXid)
52 GlobalMaxActiveXid = GlobalTotalActiveXid; 52 GlobalMaxActiveXid = GlobalTotalActiveXid;
53 if (GlobalTotalActiveXid > 65000) 53 if (GlobalTotalActiveXid > 65000)
54 cFYI(1, ("warning: more than 65000 requests active")); 54 cFYI(1, "warning: more than 65000 requests active");
55 xid = GlobalCurrentXid++; 55 xid = GlobalCurrentXid++;
56 spin_unlock(&GlobalMid_Lock); 56 spin_unlock(&GlobalMid_Lock);
57 return xid; 57 return xid;
@@ -88,7 +88,7 @@ void
88sesInfoFree(struct cifsSesInfo *buf_to_free) 88sesInfoFree(struct cifsSesInfo *buf_to_free)
89{ 89{
90 if (buf_to_free == NULL) { 90 if (buf_to_free == NULL) {
91 cFYI(1, ("Null buffer passed to sesInfoFree")); 91 cFYI(1, "Null buffer passed to sesInfoFree");
92 return; 92 return;
93 } 93 }
94 94
@@ -126,7 +126,7 @@ void
126tconInfoFree(struct cifsTconInfo *buf_to_free) 126tconInfoFree(struct cifsTconInfo *buf_to_free)
127{ 127{
128 if (buf_to_free == NULL) { 128 if (buf_to_free == NULL) {
129 cFYI(1, ("Null buffer passed to tconInfoFree")); 129 cFYI(1, "Null buffer passed to tconInfoFree");
130 return; 130 return;
131 } 131 }
132 atomic_dec(&tconInfoAllocCount); 132 atomic_dec(&tconInfoAllocCount);
@@ -166,7 +166,7 @@ void
166cifs_buf_release(void *buf_to_free) 166cifs_buf_release(void *buf_to_free)
167{ 167{
168 if (buf_to_free == NULL) { 168 if (buf_to_free == NULL) {
169 /* cFYI(1, ("Null buffer passed to cifs_buf_release"));*/ 169 /* cFYI(1, "Null buffer passed to cifs_buf_release");*/
170 return; 170 return;
171 } 171 }
172 mempool_free(buf_to_free, cifs_req_poolp); 172 mempool_free(buf_to_free, cifs_req_poolp);
@@ -202,7 +202,7 @@ cifs_small_buf_release(void *buf_to_free)
202{ 202{
203 203
204 if (buf_to_free == NULL) { 204 if (buf_to_free == NULL) {
205 cFYI(1, ("Null buffer passed to cifs_small_buf_release")); 205 cFYI(1, "Null buffer passed to cifs_small_buf_release");
206 return; 206 return;
207 } 207 }
208 mempool_free(buf_to_free, cifs_sm_req_poolp); 208 mempool_free(buf_to_free, cifs_sm_req_poolp);
@@ -345,19 +345,19 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
345 /* with userid/password pairs found on the smb session */ 345 /* with userid/password pairs found on the smb session */
346 /* for other target tcp/ip addresses BB */ 346 /* for other target tcp/ip addresses BB */
347 if (current_fsuid() != treeCon->ses->linux_uid) { 347 if (current_fsuid() != treeCon->ses->linux_uid) {
348 cFYI(1, ("Multiuser mode and UID " 348 cFYI(1, "Multiuser mode and UID "
349 "did not match tcon uid")); 349 "did not match tcon uid");
350 read_lock(&cifs_tcp_ses_lock); 350 read_lock(&cifs_tcp_ses_lock);
351 list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) { 351 list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) {
352 ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list); 352 ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list);
353 if (ses->linux_uid == current_fsuid()) { 353 if (ses->linux_uid == current_fsuid()) {
354 if (ses->server == treeCon->ses->server) { 354 if (ses->server == treeCon->ses->server) {
355 cFYI(1, ("found matching uid substitute right smb_uid")); 355 cFYI(1, "found matching uid substitute right smb_uid");
356 buffer->Uid = ses->Suid; 356 buffer->Uid = ses->Suid;
357 break; 357 break;
358 } else { 358 } else {
359 /* BB eventually call cifs_setup_session here */ 359 /* BB eventually call cifs_setup_session here */
360 cFYI(1, ("local UID found but no smb sess with this server exists")); 360 cFYI(1, "local UID found but no smb sess with this server exists");
361 } 361 }
362 } 362 }
363 } 363 }
@@ -394,17 +394,16 @@ checkSMBhdr(struct smb_hdr *smb, __u16 mid)
394 if (smb->Command == SMB_COM_LOCKING_ANDX) 394 if (smb->Command == SMB_COM_LOCKING_ANDX)
395 return 0; 395 return 0;
396 else 396 else
397 cERROR(1, ("Received Request not response")); 397 cERROR(1, "Received Request not response");
398 } 398 }
399 } else { /* bad signature or mid */ 399 } else { /* bad signature or mid */
400 if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) 400 if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff))
401 cERROR(1, 401 cERROR(1, "Bad protocol string signature header %x",
402 ("Bad protocol string signature header %x", 402 *(unsigned int *) smb->Protocol);
403 *(unsigned int *) smb->Protocol));
404 if (mid != smb->Mid) 403 if (mid != smb->Mid)
405 cERROR(1, ("Mids do not match")); 404 cERROR(1, "Mids do not match");
406 } 405 }
407 cERROR(1, ("bad smb detected. The Mid=%d", smb->Mid)); 406 cERROR(1, "bad smb detected. The Mid=%d", smb->Mid);
408 return 1; 407 return 1;
409} 408}
410 409
@@ -413,7 +412,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
413{ 412{
414 __u32 len = smb->smb_buf_length; 413 __u32 len = smb->smb_buf_length;
415 __u32 clc_len; /* calculated length */ 414 __u32 clc_len; /* calculated length */
416 cFYI(0, ("checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len)); 415 cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len);
417 416
418 if (length < 2 + sizeof(struct smb_hdr)) { 417 if (length < 2 + sizeof(struct smb_hdr)) {
419 if ((length >= sizeof(struct smb_hdr) - 1) 418 if ((length >= sizeof(struct smb_hdr) - 1)
@@ -437,15 +436,15 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
437 tmp[sizeof(struct smb_hdr)+1] = 0; 436 tmp[sizeof(struct smb_hdr)+1] = 0;
438 return 0; 437 return 0;
439 } 438 }
440 cERROR(1, ("rcvd invalid byte count (bcc)")); 439 cERROR(1, "rcvd invalid byte count (bcc)");
441 } else { 440 } else {
442 cERROR(1, ("Length less than smb header size")); 441 cERROR(1, "Length less than smb header size");
443 } 442 }
444 return 1; 443 return 1;
445 } 444 }
446 if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { 445 if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
447 cERROR(1, ("smb length greater than MaxBufSize, mid=%d", 446 cERROR(1, "smb length greater than MaxBufSize, mid=%d",
448 smb->Mid)); 447 smb->Mid);
449 return 1; 448 return 1;
450 } 449 }
451 450
@@ -454,8 +453,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
454 clc_len = smbCalcSize_LE(smb); 453 clc_len = smbCalcSize_LE(smb);
455 454
456 if (4 + len != length) { 455 if (4 + len != length) {
457 cERROR(1, ("Length read does not match RFC1001 length %d", 456 cERROR(1, "Length read does not match RFC1001 length %d",
458 len)); 457 len);
459 return 1; 458 return 1;
460 } 459 }
461 460
@@ -466,8 +465,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
466 if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF)) 465 if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF))
467 return 0; /* bcc wrapped */ 466 return 0; /* bcc wrapped */
468 } 467 }
469 cFYI(1, ("Calculated size %d vs length %d mismatch for mid %d", 468 cFYI(1, "Calculated size %d vs length %d mismatch for mid %d",
470 clc_len, 4 + len, smb->Mid)); 469 clc_len, 4 + len, smb->Mid);
471 /* Windows XP can return a few bytes too much, presumably 470 /* Windows XP can return a few bytes too much, presumably
472 an illegal pad, at the end of byte range lock responses 471 an illegal pad, at the end of byte range lock responses
473 so we allow for that three byte pad, as long as actual 472 so we allow for that three byte pad, as long as actual
@@ -482,8 +481,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
482 if ((4+len > clc_len) && (len <= clc_len + 512)) 481 if ((4+len > clc_len) && (len <= clc_len + 512))
483 return 0; 482 return 0;
484 else { 483 else {
485 cERROR(1, ("RFC1001 size %d bigger than SMB for Mid=%d", 484 cERROR(1, "RFC1001 size %d bigger than SMB for Mid=%d",
486 len, smb->Mid)); 485 len, smb->Mid);
487 return 1; 486 return 1;
488 } 487 }
489 } 488 }
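checkSMBhdr() and checkSMB() above validate two layers of framing: the SMB signature, whose first four bytes are 0xFF 'S' 'M' 'B' and read as the little-endian word 0x424d53ff, and the 4-byte RFC1001 length prefix, which must account for the bytes actually received (with a few tolerance cases for buggy servers that the sketch below omits). Simplified, assuming the smb_hdr layout from cifspdu.h:

static int smb_frame_ok(const struct smb_hdr *smb, unsigned int rcvd_len)
{
	/* "\xffSMB" viewed as one little-endian 32-bit word */
	if (*(__le32 *)smb->Protocol != cpu_to_le32(0x424d53ff))
		return 0;
	/* 4-byte RFC1001 header, then smb_buf_length bytes of SMB payload */
	if (4 + smb->smb_buf_length != rcvd_len)
		return 0;
	return 1;
}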
@@ -501,7 +500,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
501 struct cifsFileInfo *netfile; 500 struct cifsFileInfo *netfile;
502 int rc; 501 int rc;
503 502
504 cFYI(1, ("Checking for oplock break or dnotify response")); 503 cFYI(1, "Checking for oplock break or dnotify response");
505 if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) && 504 if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) &&
506 (pSMB->hdr.Flags & SMBFLG_RESPONSE)) { 505 (pSMB->hdr.Flags & SMBFLG_RESPONSE)) {
507 struct smb_com_transaction_change_notify_rsp *pSMBr = 506 struct smb_com_transaction_change_notify_rsp *pSMBr =
@@ -513,15 +512,15 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
513 512
514 pnotify = (struct file_notify_information *) 513 pnotify = (struct file_notify_information *)
515 ((char *)&pSMBr->hdr.Protocol + data_offset); 514 ((char *)&pSMBr->hdr.Protocol + data_offset);
516 cFYI(1, ("dnotify on %s Action: 0x%x", 515 cFYI(1, "dnotify on %s Action: 0x%x",
517 pnotify->FileName, pnotify->Action)); 516 pnotify->FileName, pnotify->Action);
518 /* cifs_dump_mem("Rcvd notify Data: ",buf, 517 /* cifs_dump_mem("Rcvd notify Data: ",buf,
519 sizeof(struct smb_hdr)+60); */ 518 sizeof(struct smb_hdr)+60); */
520 return true; 519 return true;
521 } 520 }
522 if (pSMBr->hdr.Status.CifsError) { 521 if (pSMBr->hdr.Status.CifsError) {
523 cFYI(1, ("notify err 0x%d", 522 cFYI(1, "notify err 0x%d",
524 pSMBr->hdr.Status.CifsError)); 523 pSMBr->hdr.Status.CifsError);
525 return true; 524 return true;
526 } 525 }
527 return false; 526 return false;
@@ -535,7 +534,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
535 large dirty files cached on the client */ 534 large dirty files cached on the client */
536 if ((NT_STATUS_INVALID_HANDLE) == 535 if ((NT_STATUS_INVALID_HANDLE) ==
537 le32_to_cpu(pSMB->hdr.Status.CifsError)) { 536 le32_to_cpu(pSMB->hdr.Status.CifsError)) {
538 cFYI(1, ("invalid handle on oplock break")); 537 cFYI(1, "invalid handle on oplock break");
539 return true; 538 return true;
540 } else if (ERRbadfid == 539 } else if (ERRbadfid ==
541 le16_to_cpu(pSMB->hdr.Status.DosError.Error)) { 540 le16_to_cpu(pSMB->hdr.Status.DosError.Error)) {
@@ -547,8 +546,8 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
547 if (pSMB->hdr.WordCount != 8) 546 if (pSMB->hdr.WordCount != 8)
548 return false; 547 return false;
549 548
550 cFYI(1, ("oplock type 0x%d level 0x%d", 549 cFYI(1, "oplock type 0x%d level 0x%d",
551 pSMB->LockType, pSMB->OplockLevel)); 550 pSMB->LockType, pSMB->OplockLevel);
552 if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE)) 551 if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE))
553 return false; 552 return false;
554 553
@@ -579,15 +578,15 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
579 return true; 578 return true;
580 } 579 }
581 580
582 cFYI(1, ("file id match, oplock break")); 581 cFYI(1, "file id match, oplock break");
583 pCifsInode = CIFS_I(netfile->pInode); 582 pCifsInode = CIFS_I(netfile->pInode);
584 pCifsInode->clientCanCacheAll = false; 583 pCifsInode->clientCanCacheAll = false;
585 if (pSMB->OplockLevel == 0) 584 if (pSMB->OplockLevel == 0)
586 pCifsInode->clientCanCacheRead = false; 585 pCifsInode->clientCanCacheRead = false;
587 rc = slow_work_enqueue(&netfile->oplock_break); 586 rc = slow_work_enqueue(&netfile->oplock_break);
588 if (rc) { 587 if (rc) {
589 cERROR(1, ("failed to enqueue oplock " 588 cERROR(1, "failed to enqueue oplock "
590 "break: %d\n", rc)); 589 "break: %d\n", rc);
591 } else { 590 } else {
592 netfile->oplock_break_cancelled = false; 591 netfile->oplock_break_cancelled = false;
593 } 592 }
@@ -597,12 +596,12 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
597 } 596 }
598 read_unlock(&GlobalSMBSeslock); 597 read_unlock(&GlobalSMBSeslock);
599 read_unlock(&cifs_tcp_ses_lock); 598 read_unlock(&cifs_tcp_ses_lock);
600 cFYI(1, ("No matching file for oplock break")); 599 cFYI(1, "No matching file for oplock break");
601 return true; 600 return true;
602 } 601 }
603 } 602 }
604 read_unlock(&cifs_tcp_ses_lock); 603 read_unlock(&cifs_tcp_ses_lock);
605 cFYI(1, ("Can not process oplock break for non-existent connection")); 604 cFYI(1, "Can not process oplock break for non-existent connection");
606 return true; 605 return true;
607} 606}
608 607
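The oplock-break path above downgrades the client's caching rights in two steps: any break revokes write caching (clientCanCacheAll), and a break all the way to "none" (OplockLevel == 0, as opposed to a downgrade to level II) revokes read caching too, before the acknowledgement work is queued via slow_work_enqueue(). The same logic, isolated; field names follow the cifsInodeInfo flags used in the hunk:

/* assumes struct cifsInodeInfo from fs/cifs/cifsglob.h */
static void downgrade_oplock_caching(struct cifsInodeInfo *ci,
				     unsigned char oplock_level)
{
	ci->clientCanCacheAll = false;		/* write caching always lost */
	if (oplock_level == 0)
		ci->clientCanCacheRead = false;	/* broken to "none" */
}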
@@ -721,11 +720,11 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
721{ 720{
722 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { 721 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
723 cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; 722 cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
724 cERROR(1, ("Autodisabling the use of server inode numbers on " 723 cERROR(1, "Autodisabling the use of server inode numbers on "
725 "%s. This server doesn't seem to support them " 724 "%s. This server doesn't seem to support them "
726 "properly. Hardlinks will not be recognized on this " 725 "properly. Hardlinks will not be recognized on this "
727 "mount. Consider mounting with the \"noserverino\" " 726 "mount. Consider mounting with the \"noserverino\" "
728 "option to silence this message.", 727 "option to silence this message.",
729 cifs_sb->tcon->treeName)); 728 cifs_sb->tcon->treeName);
730 } 729 }
731} 730}
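cifs_autodisable_serverino() fires when server-supplied inode numbers turn out to be unusable. With CIFS_MOUNT_SERVER_INUM the client derives i_ino from the server's UniqueId, so hardlinked names resolve to one inode; once autodisabled, inode numbers must be generated locally and hardlinks can no longer be recognized — which is also why cifs_inode_needs_reval() gives hardlinked regular files special treatment under noserverino. A rough sketch of the fallback, assuming iunique() as the local generator (the literal 2 stands in for CIFS's reserved low inode numbers):

#include <linux/fs.h>

static ino_t pick_ino(struct super_block *sb, bool server_inum_usable,
		      u64 server_unique_id)
{
	if (server_inum_usable)
		return (ino_t)server_unique_id;	/* stable across hardlinks */
	return iunique(sb, 2);			/* locally unique only */
}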
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index bd6d6895730d..d35d52889cb5 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -149,7 +149,7 @@ cifs_inet_pton(const int address_family, const char *cp, void *dst)
149 else if (address_family == AF_INET6) 149 else if (address_family == AF_INET6)
150 ret = in6_pton(cp, -1 /* len */, dst , '\\', NULL); 150 ret = in6_pton(cp, -1 /* len */, dst , '\\', NULL);
151 151
152 cFYI(DBG2, ("address conversion returned %d for %s", ret, cp)); 152 cFYI(DBG2, "address conversion returned %d for %s", ret, cp);
153 if (ret > 0) 153 if (ret > 0)
154 ret = 1; 154 ret = 1;
155 return ret; 155 return ret;
@@ -870,8 +870,8 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr)
870 } 870 }
871 /* else ERRHRD class errors or junk - return EIO */ 871 /* else ERRHRD class errors or junk - return EIO */
872 872
873 cFYI(1, ("Mapping smb error code %d to POSIX err %d", 873 cFYI(1, "Mapping smb error code %d to POSIX err %d",
874 smberrcode, rc)); 874 smberrcode, rc);
875 875
876 /* generic corrective action e.g. reconnect SMB session on 876 /* generic corrective action e.g. reconnect SMB session on
877 * ERRbaduid could be added */ 877 * ERRbaduid could be added */
@@ -940,20 +940,20 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
940 SMB_TIME *st = (SMB_TIME *)&time; 940 SMB_TIME *st = (SMB_TIME *)&time;
941 SMB_DATE *sd = (SMB_DATE *)&date; 941 SMB_DATE *sd = (SMB_DATE *)&date;
942 942
943 cFYI(1, ("date %d time %d", date, time)); 943 cFYI(1, "date %d time %d", date, time);
944 944
945 sec = 2 * st->TwoSeconds; 945 sec = 2 * st->TwoSeconds;
946 min = st->Minutes; 946 min = st->Minutes;
947 if ((sec > 59) || (min > 59)) 947 if ((sec > 59) || (min > 59))
948 cERROR(1, ("illegal time min %d sec %d", min, sec)); 948 cERROR(1, "illegal time min %d sec %d", min, sec);
949 sec += (min * 60); 949 sec += (min * 60);
950 sec += 60 * 60 * st->Hours; 950 sec += 60 * 60 * st->Hours;
951 if (st->Hours > 24) 951 if (st->Hours > 24)
952 cERROR(1, ("illegal hours %d", st->Hours)); 952 cERROR(1, "illegal hours %d", st->Hours);
953 days = sd->Day; 953 days = sd->Day;
954 month = sd->Month; 954 month = sd->Month;
955 if ((days > 31) || (month > 12)) { 955 if ((days > 31) || (month > 12)) {
956 cERROR(1, ("illegal date, month %d day: %d", month, days)); 956 cERROR(1, "illegal date, month %d day: %d", month, days);
957 if (month > 12) 957 if (month > 12)
958 month = 12; 958 month = 12;
959 } 959 }
@@ -979,7 +979,7 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
979 979
980 ts.tv_sec = sec + offset; 980 ts.tv_sec = sec + offset;
981 981
982 /* cFYI(1,("sec after cnvrt dos to unix time %d",sec)); */ 982 /* cFYI(1, "sec after cnvrt dos to unix time %d",sec); */
983 983
984 ts.tv_nsec = 0; 984 ts.tv_nsec = 0;
985 return ts; 985 return ts;
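cnvrtDosUnixTm() above unpacks the classic 16-bit DOS (FAT) date and time words through the SMB_TIME/SMB_DATE bitfields: seconds are stored halved in the low five bits of the time word, and the year counts from 1980 in the top seven bits of the date word — hence the range checks on minutes, hours, days and months. An equivalent stand-alone decoder using shifts instead of bitfields:

#include <linux/types.h>

struct dos_tm {
	unsigned int sec, min, hour;	/* from the time word */
	unsigned int day, mon, year;	/* from the date word; year is A.D. */
};

static struct dos_tm decode_dos_tm(u16 time, u16 date)
{
	struct dos_tm t;

	t.sec  = (time & 0x1f) * 2;		/* bits 0-4: 2-second units */
	t.min  = (time >> 5) & 0x3f;		/* bits 5-10 */
	t.hour = (time >> 11) & 0x1f;		/* bits 11-15 */
	t.day  = date & 0x1f;			/* bits 0-4 */
	t.mon  = (date >> 5) & 0x0f;		/* bits 5-8 */
	t.year = ((date >> 9) & 0x7f) + 1980;	/* bits 9-15 */
	return t;
}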
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index c343b14ba2d3..daf1753af674 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -22,6 +22,7 @@
22 */ 22 */
23#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/pagemap.h> 24#include <linux/pagemap.h>
25#include <linux/slab.h>
25#include <linux/stat.h> 26#include <linux/stat.h>
26#include "cifspdu.h" 27#include "cifspdu.h"
27#include "cifsglob.h" 28#include "cifsglob.h"
@@ -46,15 +47,15 @@ static void dump_cifs_file_struct(struct file *file, char *label)
46 if (file) { 47 if (file) {
47 cf = file->private_data; 48 cf = file->private_data;
48 if (cf == NULL) { 49 if (cf == NULL) {
49 cFYI(1, ("empty cifs private file data")); 50 cFYI(1, "empty cifs private file data");
50 return; 51 return;
51 } 52 }
52 if (cf->invalidHandle) 53 if (cf->invalidHandle)
53 cFYI(1, ("invalid handle")); 54 cFYI(1, "invalid handle");
54 if (cf->srch_inf.endOfSearch) 55 if (cf->srch_inf.endOfSearch)
55 cFYI(1, ("end of search")); 56 cFYI(1, "end of search");
56 if (cf->srch_inf.emptyDir) 57 if (cf->srch_inf.emptyDir)
57 cFYI(1, ("empty dir")); 58 cFYI(1, "empty dir");
58 } 59 }
59} 60}
60#else 61#else
@@ -75,7 +76,7 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
75 struct inode *inode; 76 struct inode *inode;
76 struct super_block *sb = parent->d_inode->i_sb; 77 struct super_block *sb = parent->d_inode->i_sb;
77 78
78 cFYI(1, ("For %s", name->name)); 79 cFYI(1, "For %s", name->name);
79 80
80 if (parent->d_op && parent->d_op->d_hash) 81 if (parent->d_op && parent->d_op->d_hash)
81 parent->d_op->d_hash(parent, name); 82 parent->d_op->d_hash(parent, name);
@@ -213,7 +214,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
213 fid, 214 fid,
214 cifs_sb->local_nls); 215 cifs_sb->local_nls);
215 if (CIFSSMBClose(xid, ptcon, fid)) { 216 if (CIFSSMBClose(xid, ptcon, fid)) {
216 cFYI(1, ("Error closing temporary reparsepoint open)")); 217 cFYI(1, "Error closing temporary reparsepoint open");
217 } 218 }
218 } 219 }
219} 220}
@@ -251,7 +252,7 @@ static int initiate_cifs_search(const int xid, struct file *file)
251 if (full_path == NULL) 252 if (full_path == NULL)
252 return -ENOMEM; 253 return -ENOMEM;
253 254
254 cFYI(1, ("Full path: %s start at: %lld", full_path, file->f_pos)); 255 cFYI(1, "Full path: %s start at: %lld", full_path, file->f_pos);
255 256
256ffirst_retry: 257ffirst_retry:
257 /* test for Unix extensions */ 258 /* test for Unix extensions */
@@ -296,7 +297,7 @@ static int cifs_unicode_bytelen(char *str)
296 if (ustr[len] == 0) 297 if (ustr[len] == 0)
297 return len << 1; 298 return len << 1;
298 } 299 }
299 cFYI(1, ("Unicode string longer than PATH_MAX found")); 300 cFYI(1, "Unicode string longer than PATH_MAX found");
300 return len << 1; 301 return len << 1;
301} 302}
302 303
@@ -313,19 +314,18 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level)
313 pfData->FileNameLength; 314 pfData->FileNameLength;
314 } else 315 } else
315 new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset); 316 new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset);
316 cFYI(1, ("new entry %p old entry %p", new_entry, old_entry)); 317 cFYI(1, "new entry %p old entry %p", new_entry, old_entry);
317 /* validate that new_entry is not past end of SMB */ 318 /* validate that new_entry is not past end of SMB */
318 if (new_entry >= end_of_smb) { 319 if (new_entry >= end_of_smb) {
319 cERROR(1, 320 cERROR(1, "search entry %p began after end of SMB %p old entry %p",
320 ("search entry %p began after end of SMB %p old entry %p", 321 new_entry, end_of_smb, old_entry);
321 new_entry, end_of_smb, old_entry));
322 return NULL; 322 return NULL;
323 } else if (((level == SMB_FIND_FILE_INFO_STANDARD) && 323 } else if (((level == SMB_FIND_FILE_INFO_STANDARD) &&
324 (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb)) 324 (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb))
325 || ((level != SMB_FIND_FILE_INFO_STANDARD) && 325 || ((level != SMB_FIND_FILE_INFO_STANDARD) &&
326 (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb))) { 326 (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb))) {
327 cERROR(1, ("search entry %p extends after end of SMB %p", 327 cERROR(1, "search entry %p extends after end of SMB %p",
328 new_entry, end_of_smb)); 328 new_entry, end_of_smb);
329 return NULL; 329 return NULL;
330 } else 330 } else
331 return new_entry; 331 return new_entry;
@@ -379,8 +379,8 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile)
379 filename = &pFindData->FileName[0]; 379 filename = &pFindData->FileName[0];
380 len = pFindData->FileNameLength; 380 len = pFindData->FileNameLength;
381 } else { 381 } else {
382 cFYI(1, ("Unknown findfirst level %d", 382 cFYI(1, "Unknown findfirst level %d",
383 cfile->srch_inf.info_level)); 383 cfile->srch_inf.info_level);
384 } 384 }
385 385
386 if (filename) { 386 if (filename) {
@@ -480,7 +480,7 @@ static int cifs_save_resume_key(const char *current_entry,
480 len = (unsigned int)pFindData->FileNameLength; 480 len = (unsigned int)pFindData->FileNameLength;
481 cifsFile->srch_inf.resume_key = pFindData->ResumeKey; 481 cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
482 } else { 482 } else {
483 cFYI(1, ("Unknown findfirst level %d", level)); 483 cFYI(1, "Unknown findfirst level %d", level);
484 return -EINVAL; 484 return -EINVAL;
485 } 485 }
486 cifsFile->srch_inf.resume_name_len = len; 486 cifsFile->srch_inf.resume_name_len = len;
@@ -524,7 +524,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
524 is_dir_changed(file)) || 524 is_dir_changed(file)) ||
525 (index_to_find < first_entry_in_buffer)) { 525 (index_to_find < first_entry_in_buffer)) {
526 /* close and restart search */ 526 /* close and restart search */
527 cFYI(1, ("search backing up - close and restart search")); 527 cFYI(1, "search backing up - close and restart search");
528 write_lock(&GlobalSMBSeslock); 528 write_lock(&GlobalSMBSeslock);
529 if (!cifsFile->srch_inf.endOfSearch && 529 if (!cifsFile->srch_inf.endOfSearch &&
530 !cifsFile->invalidHandle) { 530 !cifsFile->invalidHandle) {
@@ -534,7 +534,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
534 } else 534 } else
535 write_unlock(&GlobalSMBSeslock); 535 write_unlock(&GlobalSMBSeslock);
536 if (cifsFile->srch_inf.ntwrk_buf_start) { 536 if (cifsFile->srch_inf.ntwrk_buf_start) {
537 cFYI(1, ("freeing SMB ff cache buf on search rewind")); 537 cFYI(1, "freeing SMB ff cache buf on search rewind");
538 if (cifsFile->srch_inf.smallBuf) 538 if (cifsFile->srch_inf.smallBuf)
539 cifs_small_buf_release(cifsFile->srch_inf. 539 cifs_small_buf_release(cifsFile->srch_inf.
540 ntwrk_buf_start); 540 ntwrk_buf_start);
@@ -545,8 +545,8 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
545 } 545 }
546 rc = initiate_cifs_search(xid, file); 546 rc = initiate_cifs_search(xid, file);
547 if (rc) { 547 if (rc) {
548 cFYI(1, ("error %d reinitiating a search on rewind", 548 cFYI(1, "error %d reinitiating a search on rewind",
549 rc)); 549 rc);
550 return rc; 550 return rc;
551 } 551 }
552 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); 552 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
@@ -554,7 +554,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
554 554
555 while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) && 555 while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) &&
556 (rc == 0) && !cifsFile->srch_inf.endOfSearch) { 556 (rc == 0) && !cifsFile->srch_inf.endOfSearch) {
557 cFYI(1, ("calling findnext2")); 557 cFYI(1, "calling findnext2");
558 rc = CIFSFindNext(xid, pTcon, cifsFile->netfid, 558 rc = CIFSFindNext(xid, pTcon, cifsFile->netfid,
559 &cifsFile->srch_inf); 559 &cifsFile->srch_inf);
560 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); 560 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
@@ -574,7 +574,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
574 first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry 574 first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry
575 - cifsFile->srch_inf.entries_in_buffer; 575 - cifsFile->srch_inf.entries_in_buffer;
576 pos_in_buf = index_to_find - first_entry_in_buffer; 576 pos_in_buf = index_to_find - first_entry_in_buffer;
577 cFYI(1, ("found entry - pos_in_buf %d", pos_in_buf)); 577 cFYI(1, "found entry - pos_in_buf %d", pos_in_buf);
578 578
579 for (i = 0; (i < (pos_in_buf)) && (current_entry != NULL); i++) { 579 for (i = 0; (i < (pos_in_buf)) && (current_entry != NULL); i++) {
580 /* go entry by entry figuring out which is first */ 580 /* go entry by entry figuring out which is first */
@@ -583,19 +583,19 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
583 } 583 }
584 if ((current_entry == NULL) && (i < pos_in_buf)) { 584 if ((current_entry == NULL) && (i < pos_in_buf)) {
585 /* BB fixme - check if we should flag this error */ 585 /* BB fixme - check if we should flag this error */
586 cERROR(1, ("reached end of buf searching for pos in buf" 586 cERROR(1, "reached end of buf searching for pos in buf"
587 " %d index to find %lld rc %d", 587 " %d index to find %lld rc %d",
588 pos_in_buf, index_to_find, rc)); 588 pos_in_buf, index_to_find, rc);
589 } 589 }
590 rc = 0; 590 rc = 0;
591 *ppCurrentEntry = current_entry; 591 *ppCurrentEntry = current_entry;
592 } else { 592 } else {
593 cFYI(1, ("index not in buffer - could not findnext into it")); 593 cFYI(1, "index not in buffer - could not findnext into it");
594 return 0; 594 return 0;
595 } 595 }
596 596
597 if (pos_in_buf >= cifsFile->srch_inf.entries_in_buffer) { 597 if (pos_in_buf >= cifsFile->srch_inf.entries_in_buffer) {
598 cFYI(1, ("can not return entries pos_in_buf beyond last")); 598 cFYI(1, "can not return entries pos_in_buf beyond last");
599 *num_to_ret = 0; 599 *num_to_ret = 0;
600 } else 600 } else
601 *num_to_ret = cifsFile->srch_inf.entries_in_buffer - pos_in_buf; 601 *num_to_ret = cifsFile->srch_inf.entries_in_buffer - pos_in_buf;
@@ -655,12 +655,12 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
655 /* one byte length, no name conversion */ 655 /* one byte length, no name conversion */
656 len = (unsigned int)pFindData->FileNameLength; 656 len = (unsigned int)pFindData->FileNameLength;
657 } else { 657 } else {
658 cFYI(1, ("Unknown findfirst level %d", level)); 658 cFYI(1, "Unknown findfirst level %d", level);
659 return -EINVAL; 659 return -EINVAL;
660 } 660 }
661 661
662 if (len > max_len) { 662 if (len > max_len) {
663 cERROR(1, ("bad search response length %d past smb end", len)); 663 cERROR(1, "bad search response length %d past smb end", len);
664 return -EINVAL; 664 return -EINVAL;
665 } 665 }
666 666
@@ -753,7 +753,7 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
753 * case already. Why should we be clobbering other errors from it? 753 * case already. Why should we be clobbering other errors from it?
754 */ 754 */
755 if (rc) { 755 if (rc) {
756 cFYI(1, ("filldir rc = %d", rc)); 756 cFYI(1, "filldir rc = %d", rc);
757 rc = -EOVERFLOW; 757 rc = -EOVERFLOW;
758 } 758 }
759 dput(tmp_dentry); 759 dput(tmp_dentry);
@@ -785,7 +785,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
785 case 0: 785 case 0:
786 if (filldir(direntry, ".", 1, file->f_pos, 786 if (filldir(direntry, ".", 1, file->f_pos,
787 file->f_path.dentry->d_inode->i_ino, DT_DIR) < 0) { 787 file->f_path.dentry->d_inode->i_ino, DT_DIR) < 0) {
788 cERROR(1, ("Filldir for current dir failed")); 788 cERROR(1, "Filldir for current dir failed");
789 rc = -ENOMEM; 789 rc = -ENOMEM;
790 break; 790 break;
791 } 791 }
@@ -793,7 +793,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
793 case 1: 793 case 1:
794 if (filldir(direntry, "..", 2, file->f_pos, 794 if (filldir(direntry, "..", 2, file->f_pos,
795 file->f_path.dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) { 795 file->f_path.dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) {
796 cERROR(1, ("Filldir for parent dir failed")); 796 cERROR(1, "Filldir for parent dir failed");
797 rc = -ENOMEM; 797 rc = -ENOMEM;
798 break; 798 break;
799 } 799 }
@@ -806,7 +806,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
806 806
807 if (file->private_data == NULL) { 807 if (file->private_data == NULL) {
808 rc = initiate_cifs_search(xid, file); 808 rc = initiate_cifs_search(xid, file);
809 cFYI(1, ("initiate cifs search rc %d", rc)); 809 cFYI(1, "initiate cifs search rc %d", rc);
810 if (rc) { 810 if (rc) {
811 FreeXid(xid); 811 FreeXid(xid);
812 return rc; 812 return rc;
@@ -820,7 +820,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
820 cifsFile = file->private_data; 820 cifsFile = file->private_data;
821 if (cifsFile->srch_inf.endOfSearch) { 821 if (cifsFile->srch_inf.endOfSearch) {
822 if (cifsFile->srch_inf.emptyDir) { 822 if (cifsFile->srch_inf.emptyDir) {
823 cFYI(1, ("End of search, empty dir")); 823 cFYI(1, "End of search, empty dir");
824 rc = 0; 824 rc = 0;
825 break; 825 break;
826 } 826 }
@@ -832,16 +832,16 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
832 rc = find_cifs_entry(xid, pTcon, file, 832 rc = find_cifs_entry(xid, pTcon, file,
833 &current_entry, &num_to_fill); 833 &current_entry, &num_to_fill);
834 if (rc) { 834 if (rc) {
835 cFYI(1, ("fce error %d", rc)); 835 cFYI(1, "fce error %d", rc);
836 goto rddir2_exit; 836 goto rddir2_exit;
837 } else if (current_entry != NULL) { 837 } else if (current_entry != NULL) {
838 cFYI(1, ("entry %lld found", file->f_pos)); 838 cFYI(1, "entry %lld found", file->f_pos);
839 } else { 839 } else {
840 cFYI(1, ("could not find entry")); 840 cFYI(1, "could not find entry");
841 goto rddir2_exit; 841 goto rddir2_exit;
842 } 842 }
843 cFYI(1, ("loop through %d times filling dir for net buf %p", 843 cFYI(1, "loop through %d times filling dir for net buf %p",
844 num_to_fill, cifsFile->srch_inf.ntwrk_buf_start)); 844 num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
845 max_len = smbCalcSize((struct smb_hdr *) 845 max_len = smbCalcSize((struct smb_hdr *)
846 cifsFile->srch_inf.ntwrk_buf_start); 846 cifsFile->srch_inf.ntwrk_buf_start);
847 end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len; 847 end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
@@ -850,8 +850,8 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
850 for (i = 0; (i < num_to_fill) && (rc == 0); i++) { 850 for (i = 0; (i < num_to_fill) && (rc == 0); i++) {
851 if (current_entry == NULL) { 851 if (current_entry == NULL) {
852 /* evaluate whether this case is an error */ 852 /* evaluate whether this case is an error */
853 cERROR(1, ("past SMB end, num to fill %d i %d", 853 cERROR(1, "past SMB end, num to fill %d i %d",
854 num_to_fill, i)); 854 num_to_fill, i);
855 break; 855 break;
856 } 856 }
857 /* if buggy server returns . and .. late do 857 /* if buggy server returns . and .. late do
@@ -866,8 +866,8 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
866 file->f_pos++; 866 file->f_pos++;
867 if (file->f_pos == 867 if (file->f_pos ==
868 cifsFile->srch_inf.index_of_last_entry) { 868 cifsFile->srch_inf.index_of_last_entry) {
869 cFYI(1, ("last entry in buf at pos %lld %s", 869 cFYI(1, "last entry in buf at pos %lld %s",
870 file->f_pos, tmp_buf)); 870 file->f_pos, tmp_buf);
871 cifs_save_resume_key(current_entry, cifsFile); 871 cifs_save_resume_key(current_entry, cifsFile);
872 break; 872 break;
873 } else 873 } else
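find_cifs_entry() above implements forward-only paging over the server's search results: the readdir position is mapped to a search index, FindNext is issued until the buffer containing that index arrives, and a seek backwards (or a changed directory) forces the search to be closed and restarted from scratch, with the resume key saved after every buffer so the search can continue where it left off. The control flow, reduced to its shape — restart_search() and fetch_next_buffer() are hypothetical stand-ins for the CIFSFindFirst/CIFSFindNext calls:

struct search_state {
	long long first_index;	/* index of first entry in current buffer */
	long long last_index;	/* one past the last entry in the buffer */
	bool end_of_search;
};

/* returns 0 once the buffer holding 'want' is loaded, -1 otherwise */
static int page_to_index(struct search_state *s, long long want)
{
	if (want < s->first_index && restart_search(s))
		return -1;	/* servers only page forward: reopen search */
	while (want >= s->last_index && !s->end_of_search)
		if (fetch_next_buffer(s))
			return -1;
	return want < s->last_index ? 0 : -1;	/* -1: past end of dir */
}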
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index aaa9c1c5a5bd..7707389bdf2c 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -29,14 +29,17 @@
29#include "ntlmssp.h" 29#include "ntlmssp.h"
30#include "nterr.h" 30#include "nterr.h"
31#include <linux/utsname.h> 31#include <linux/utsname.h>
32#include <linux/slab.h>
32#include "cifs_spnego.h" 33#include "cifs_spnego.h"
33 34
34extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, 35extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
35 unsigned char *p24); 36 unsigned char *p24);
36 37
37/* Checks if this is the first smb session to be reconnected after 38/*
38 the socket has been reestablished (so we know whether to use vc 0). 39 * Checks if this is the first smb session to be reconnected after
39 Called while holding the cifs_tcp_ses_lock, so do not block */ 40 * the socket has been reestablished (so we know whether to use vc 0).
41 * Called while holding the cifs_tcp_ses_lock, so do not block
42 */
40static bool is_first_ses_reconnect(struct cifsSesInfo *ses) 43static bool is_first_ses_reconnect(struct cifsSesInfo *ses)
41{ 44{
42 struct list_head *tmp; 45 struct list_head *tmp;
@@ -283,7 +286,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
283 int len; 286 int len;
284 char *data = *pbcc_area; 287 char *data = *pbcc_area;
285 288
286 cFYI(1, ("bleft %d", bleft)); 289 cFYI(1, "bleft %d", bleft);
287 290
288 /* 291 /*
289 * Windows servers do not always double null terminate their final 292 * Windows servers do not always double null terminate their final
@@ -300,7 +303,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
300 303
301 kfree(ses->serverOS); 304 kfree(ses->serverOS);
302 ses->serverOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp); 305 ses->serverOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
303 cFYI(1, ("serverOS=%s", ses->serverOS)); 306 cFYI(1, "serverOS=%s", ses->serverOS);
304 len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2; 307 len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
305 data += len; 308 data += len;
306 bleft -= len; 309 bleft -= len;
@@ -309,7 +312,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
309 312
310 kfree(ses->serverNOS); 313 kfree(ses->serverNOS);
311 ses->serverNOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp); 314 ses->serverNOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
312 cFYI(1, ("serverNOS=%s", ses->serverNOS)); 315 cFYI(1, "serverNOS=%s", ses->serverNOS);
313 len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2; 316 len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
314 data += len; 317 data += len;
315 bleft -= len; 318 bleft -= len;
@@ -318,7 +321,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
318 321
319 kfree(ses->serverDomain); 322 kfree(ses->serverDomain);
320 ses->serverDomain = cifs_strndup_from_ucs(data, bleft, true, nls_cp); 323 ses->serverDomain = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
321 cFYI(1, ("serverDomain=%s", ses->serverDomain)); 324 cFYI(1, "serverDomain=%s", ses->serverDomain);
322 325
323 return; 326 return;
324} 327}
@@ -331,7 +334,7 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
331 int len; 334 int len;
332 char *bcc_ptr = *pbcc_area; 335 char *bcc_ptr = *pbcc_area;
333 336
334 cFYI(1, ("decode sessetup ascii. bleft %d", bleft)); 337 cFYI(1, "decode sessetup ascii. bleft %d", bleft);
335 338
336 len = strnlen(bcc_ptr, bleft); 339 len = strnlen(bcc_ptr, bleft);
337 if (len >= bleft) 340 if (len >= bleft)
@@ -343,7 +346,7 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
343 if (ses->serverOS) 346 if (ses->serverOS)
344 strncpy(ses->serverOS, bcc_ptr, len); 347 strncpy(ses->serverOS, bcc_ptr, len);
345 if (strncmp(ses->serverOS, "OS/2", 4) == 0) { 348 if (strncmp(ses->serverOS, "OS/2", 4) == 0) {
346 cFYI(1, ("OS/2 server")); 349 cFYI(1, "OS/2 server");
347 ses->flags |= CIFS_SES_OS2; 350 ses->flags |= CIFS_SES_OS2;
348 } 351 }
349 352
@@ -372,7 +375,7 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
372 /* BB For newer servers which do not support Unicode, 375 /* BB For newer servers which do not support Unicode,
373 but thus do return domain here we could add parsing 376 but thus do return domain here we could add parsing
374 for it later, but it is not very important */ 377 for it later, but it is not very important */
375 cFYI(1, ("ascii: bytes left %d", bleft)); 378 cFYI(1, "ascii: bytes left %d", bleft);
376 379
377 return rc; 380 return rc;
378} 381}
@@ -383,16 +386,16 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
383 CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr; 386 CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr;
384 387
385 if (blob_len < sizeof(CHALLENGE_MESSAGE)) { 388 if (blob_len < sizeof(CHALLENGE_MESSAGE)) {
386 cERROR(1, ("challenge blob len %d too small", blob_len)); 389 cERROR(1, "challenge blob len %d too small", blob_len);
387 return -EINVAL; 390 return -EINVAL;
388 } 391 }
389 392
390 if (memcmp(pblob->Signature, "NTLMSSP", 8)) { 393 if (memcmp(pblob->Signature, "NTLMSSP", 8)) {
391 cERROR(1, ("blob signature incorrect %s", pblob->Signature)); 394 cERROR(1, "blob signature incorrect %s", pblob->Signature);
392 return -EINVAL; 395 return -EINVAL;
393 } 396 }
394 if (pblob->MessageType != NtLmChallenge) { 397 if (pblob->MessageType != NtLmChallenge) {
395 cERROR(1, ("Incorrect message type %d", pblob->MessageType)); 398 cERROR(1, "Incorrect message type %d", pblob->MessageType);
396 return -EINVAL; 399 return -EINVAL;
397 } 400 }
398 401
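decode_ntlmssp_challenge() above is the middle leg of RawNTLMSSP's three-message exchange: the client sends a Negotiate blob, the server answers with NT_STATUS_MORE_PROCESSING_REQUIRED carrying a Challenge blob, and the client repeats session setup with an Authenticate blob — which is why CIFS_SessSetup() below loops back to ssetup_ntlmssp_authenticate after the first pass. The state machine, reduced to a sketch with hypothetical helper names:

enum ntlmssp_phase { NTLMSSP_NEGOTIATE, NTLMSSP_AUTHENTICATE };

static int ntlmssp_setup(struct cifsSesInfo *ses)
{
	enum ntlmssp_phase phase = NTLMSSP_NEGOTIATE;
	int rc;

	for (;;) {
		rc = send_sess_setup(ses, phase);	/* hypothetical */
		if (phase == NTLMSSP_NEGOTIATE && rc == -EAGAIN) {
			/* server replied MORE_PROCESSING_REQUIRED + blob */
			if (decode_challenge_blob(ses))	/* hypothetical */
				return -EINVAL;
			phase = NTLMSSP_AUTHENTICATE;
			continue;
		}
		return rc;
	}
}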
@@ -446,7 +449,7 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
446 This function returns the length of the data in the blob */ 449 This function returns the length of the data in the blob */
447static int build_ntlmssp_auth_blob(unsigned char *pbuffer, 450static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
448 struct cifsSesInfo *ses, 451 struct cifsSesInfo *ses,
449 const struct nls_table *nls_cp, int first) 452 const struct nls_table *nls_cp, bool first)
450{ 453{
451 AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer; 454 AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer;
452 __u32 flags; 455 __u32 flags;
@@ -545,7 +548,7 @@ static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB,
545 548
546static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB, 549static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
547 struct cifsSesInfo *ses, 550 struct cifsSesInfo *ses,
548 const struct nls_table *nls, int first_time) 551 const struct nls_table *nls, bool first_time)
549{ 552{
550 int bloblen; 553 int bloblen;
551 554
@@ -558,8 +561,8 @@ static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
558#endif 561#endif
559 562
560int 563int
561CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, 564CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
562 const struct nls_table *nls_cp) 565 const struct nls_table *nls_cp)
563{ 566{
564 int rc = 0; 567 int rc = 0;
565 int wct; 568 int wct;
@@ -576,13 +579,18 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
576 int bytes_remaining; 579 int bytes_remaining;
577 struct key *spnego_key = NULL; 580 struct key *spnego_key = NULL;
578 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ 581 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
582 bool first_time;
579 583
580 if (ses == NULL) 584 if (ses == NULL)
581 return -EINVAL; 585 return -EINVAL;
582 586
587 read_lock(&cifs_tcp_ses_lock);
588 first_time = is_first_ses_reconnect(ses);
589 read_unlock(&cifs_tcp_ses_lock);
590
583 type = ses->server->secType; 591 type = ses->server->secType;
584 592
585 cFYI(1, ("sess setup type %d", type)); 593 cFYI(1, "sess setup type %d", type);
586ssetup_ntlmssp_authenticate: 594ssetup_ntlmssp_authenticate:
587 if (phase == NtLmChallenge) 595 if (phase == NtLmChallenge)
588 phase = NtLmAuthenticate; /* if ntlmssp, now final phase */ 596 phase = NtLmAuthenticate; /* if ntlmssp, now final phase */
@@ -663,7 +671,7 @@ ssetup_ntlmssp_authenticate:
663 changed to do higher than lanman dialect and 671 changed to do higher than lanman dialect and
664 we reconnected would we ever calc signing_key? */ 672 we reconnected would we ever calc signing_key? */
665 673
666 cFYI(1, ("Negotiating LANMAN setting up strings")); 674 cFYI(1, "Negotiating LANMAN setting up strings");
667 /* Unicode not allowed for LANMAN dialects */ 675 /* Unicode not allowed for LANMAN dialects */
668 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); 676 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
669#endif 677#endif
@@ -743,7 +751,7 @@ ssetup_ntlmssp_authenticate:
743 unicode_ssetup_strings(&bcc_ptr, ses, nls_cp); 751 unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
744 } else 752 } else
745 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); 753 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
746 } else if (type == Kerberos || type == MSKerberos) { 754 } else if (type == Kerberos) {
747#ifdef CONFIG_CIFS_UPCALL 755#ifdef CONFIG_CIFS_UPCALL
748 struct cifs_spnego_msg *msg; 756 struct cifs_spnego_msg *msg;
749 spnego_key = cifs_get_spnego_key(ses); 757 spnego_key = cifs_get_spnego_key(ses);
@@ -757,17 +765,17 @@ ssetup_ntlmssp_authenticate:
757 /* check version field to make sure that cifs.upcall is 765 /* check version field to make sure that cifs.upcall is
758 sending us a response in an expected form */ 766 sending us a response in an expected form */
759 if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { 767 if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
760 cERROR(1, ("incorrect version of cifs.upcall (expected" 768 cERROR(1, "incorrect version of cifs.upcall (expected"
761 " %d but got %d)", 769 " %d but got %d)",
762 CIFS_SPNEGO_UPCALL_VERSION, msg->version)); 770 CIFS_SPNEGO_UPCALL_VERSION, msg->version);
763 rc = -EKEYREJECTED; 771 rc = -EKEYREJECTED;
764 goto ssetup_exit; 772 goto ssetup_exit;
765 } 773 }
766 /* bail out if key is too long */ 774 /* bail out if key is too long */
767 if (msg->sesskey_len > 775 if (msg->sesskey_len >
768 sizeof(ses->server->mac_signing_key.data.krb5)) { 776 sizeof(ses->server->mac_signing_key.data.krb5)) {
769 cERROR(1, ("Kerberos signing key too long (%u bytes)", 777 cERROR(1, "Kerberos signing key too long (%u bytes)",
770 msg->sesskey_len)); 778 msg->sesskey_len);
771 rc = -EOVERFLOW; 779 rc = -EOVERFLOW;
772 goto ssetup_exit; 780 goto ssetup_exit;
773 } 781 }
@@ -795,7 +803,7 @@ ssetup_ntlmssp_authenticate:
795 /* BB: is this right? */ 803 /* BB: is this right? */
796 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); 804 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
797#else /* ! CONFIG_CIFS_UPCALL */ 805#else /* ! CONFIG_CIFS_UPCALL */
798 cERROR(1, ("Kerberos negotiated but upcall support disabled!")); 806 cERROR(1, "Kerberos negotiated but upcall support disabled!");
799 rc = -ENOSYS; 807 rc = -ENOSYS;
800 goto ssetup_exit; 808 goto ssetup_exit;
801#endif /* CONFIG_CIFS_UPCALL */ 809#endif /* CONFIG_CIFS_UPCALL */
@@ -803,12 +811,12 @@ ssetup_ntlmssp_authenticate:
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	if (type == RawNTLMSSP) {
 		if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
-			cERROR(1, ("NTLMSSP requires Unicode support"));
+			cERROR(1, "NTLMSSP requires Unicode support");
 			rc = -ENOSYS;
 			goto ssetup_exit;
 		}

-		cFYI(1, ("ntlmssp session setup phase %d", phase));
+		cFYI(1, "ntlmssp session setup phase %d", phase);
 		pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
 		capabilities |= CAP_EXTENDED_SECURITY;
 		pSMB->req.Capabilities |= cpu_to_le32(capabilities);
@@ -826,7 +834,7 @@ ssetup_ntlmssp_authenticate:
 			   on the response (challenge) */
 			smb_buf->Uid = ses->Suid;
 		} else {
-			cERROR(1, ("invalid phase %d", phase));
+			cERROR(1, "invalid phase %d", phase);
 			rc = -ENOSYS;
 			goto ssetup_exit;
 		}
@@ -838,12 +846,12 @@ ssetup_ntlmssp_authenticate:
 		}
 		unicode_oslm_strings(&bcc_ptr, nls_cp);
 	} else {
-		cERROR(1, ("secType %d not supported!", type));
+		cERROR(1, "secType %d not supported!", type);
 		rc = -ENOSYS;
 		goto ssetup_exit;
 	}
 #else
-	cERROR(1, ("secType %d not supported!", type));
+	cERROR(1, "secType %d not supported!", type);
 	rc = -ENOSYS;
 	goto ssetup_exit;
 #endif
@@ -861,7 +869,7 @@ ssetup_ntlmssp_authenticate:
 			  CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR);
 	/* SMB request buf freed in SendReceive2 */

-	cFYI(1, ("ssetup rc from sendrecv2 is %d", rc));
+	cFYI(1, "ssetup rc from sendrecv2 is %d", rc);

 	pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
 	smb_buf = (struct smb_hdr *)iov[0].iov_base;
@@ -869,7 +877,7 @@ ssetup_ntlmssp_authenticate:
 	if ((type == RawNTLMSSP) && (smb_buf->Status.CifsError ==
 			cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) {
 		if (phase != NtLmNegotiate) {
-			cERROR(1, ("Unexpected more processing error"));
+			cERROR(1, "Unexpected more processing error");
 			goto ssetup_exit;
 		}
 		/* NTLMSSP Negotiate sent now processing challenge (response) */
@@ -881,14 +889,14 @@ ssetup_ntlmssp_authenticate:

 	if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) {
 		rc = -EIO;
-		cERROR(1, ("bad word count %d", smb_buf->WordCount));
+		cERROR(1, "bad word count %d", smb_buf->WordCount);
 		goto ssetup_exit;
 	}
 	action = le16_to_cpu(pSMB->resp.Action);
 	if (action & GUEST_LOGIN)
-		cFYI(1, ("Guest login")); /* BB mark SesInfo struct? */
+		cFYI(1, "Guest login"); /* BB mark SesInfo struct? */
 	ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
-	cFYI(1, ("UID = %d ", ses->Suid));
+	cFYI(1, "UID = %d ", ses->Suid);
 	/* response can have either 3 or 4 word count - Samba sends 3 */
 	/* and lanman response is 3 */
 	bytes_remaining = BCC(smb_buf);
@@ -898,7 +906,7 @@ ssetup_ntlmssp_authenticate:
 		__u16 blob_len;
 		blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
 		if (blob_len > bytes_remaining) {
-			cERROR(1, ("bad security blob length %d", blob_len));
+			cERROR(1, "bad security blob length %d", blob_len);
 			rc = -EINVAL;
 			goto ssetup_exit;
 		}
@@ -932,7 +940,7 @@ ssetup_exit:
 	}
 	kfree(str_area);
 	if (resp_buf_type == CIFS_SMALL_BUFFER) {
-		cFYI(1, ("ssetup freeing small buf %p", iov[0].iov_base));
+		cFYI(1, "ssetup freeing small buf %p", iov[0].iov_base);
 		cifs_small_buf_release(iov[0].iov_base);
 	} else if (resp_buf_type == CIFS_LARGE_BUFFER)
 		cifs_buf_release(iov[0].iov_base);
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 93fb09a99c69..192ea51af20f 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -24,6 +24,7 @@
 */

 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 07b8e71544ee..82f78c4d6978 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -22,6 +22,7 @@

 #include <linux/fs.h>
 #include <linux/list.h>
+#include <linux/gfp.h>
 #include <linux/wait.h>
 #include <linux/net.h>
 #include <linux/delay.h>
@@ -34,7 +35,6 @@
 #include "cifs_debug.h"

 extern mempool_t *cifs_mid_poolp;
-extern struct kmem_cache *cifs_oplock_cachep;

 static struct mid_q_entry *
 AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
@@ -42,7 +42,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
 	struct mid_q_entry *temp;

 	if (server == NULL) {
-		cERROR(1, ("Null TCP session in AllocMidQEntry"));
+		cERROR(1, "Null TCP session in AllocMidQEntry");
 		return NULL;
 	}

@@ -54,7 +54,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
 	temp->mid = smb_buffer->Mid;	/* always LE */
 	temp->pid = current->pid;
 	temp->command = smb_buffer->Command;
-	cFYI(1, ("For smb_command %d", temp->command));
+	cFYI(1, "For smb_command %d", temp->command);
 	/* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */
 	/* when mid allocated can be before when sent */
 	temp->when_alloc = jiffies;
@@ -139,7 +139,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 		total_len += iov[i].iov_len;

 	smb_buffer->smb_buf_length = cpu_to_be32(smb_buffer->smb_buf_length);
-	cFYI(1, ("Sending smb: total_len %d", total_len));
+	cFYI(1, "Sending smb: total_len %d", total_len);
 	dump_smb(smb_buffer, len);

 	i = 0;
@@ -167,9 +167,8 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 		   reconnect which may clear the network problem.
 		*/
 		if ((i >= 14) || (!server->noblocksnd && (i > 2))) {
-			cERROR(1,
-				("sends on sock %p stuck for 15 seconds",
-				 ssocket));
+			cERROR(1, "sends on sock %p stuck for 15 seconds",
+				  ssocket);
 			rc = -EAGAIN;
 			break;
 		}
@@ -183,13 +182,13 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 			total_len = 0;
 			break;
 		} else if (rc > total_len) {
-			cERROR(1, ("sent %d requested %d", rc, total_len));
+			cERROR(1, "sent %d requested %d", rc, total_len);
 			break;
 		}
 		if (rc == 0) {
 			/* should never happen, letting socket clear before
 			   retrying is our only obvious option here */
-			cERROR(1, ("tcp sent no data"));
+			cERROR(1, "tcp sent no data");
 			msleep(500);
 			continue;
 		}
@@ -212,8 +211,8 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 	}

 	if ((total_len > 0) && (total_len != smb_buf_length + 4)) {
-		cFYI(1, ("partial send (%d remaining), terminating session",
-			total_len));
+		cFYI(1, "partial send (%d remaining), terminating session",
+			total_len);
 		/* If we have only sent part of an SMB then the next SMB
 		   could be taken as the remainder of this one. We need
 		   to kill the socket so the server throws away the partial
@@ -222,7 +221,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 	}

 	if (rc < 0) {
-		cERROR(1, ("Error %d sending data on socket to server", rc));
+		cERROR(1, "Error %d sending data on socket to server", rc);
 	} else
 		rc = 0;

@@ -295,7 +294,7 @@ static int allocate_mid(struct cifsSesInfo *ses, struct smb_hdr *in_buf,
 	}

 	if (ses->server->tcpStatus == CifsNeedReconnect) {
-		cFYI(1, ("tcp session dead - return to caller to retry"));
+		cFYI(1, "tcp session dead - return to caller to retry");
 		return -EAGAIN;
 	}

@@ -347,7 +346,7 @@ static int wait_for_response(struct cifsSesInfo *ses,
 		lrt += time_to_wait;
 		if (time_after(jiffies, lrt)) {
 			/* No replies for time_to_wait. */
-			cERROR(1, ("server not responding"));
+			cERROR(1, "server not responding");
 			return -1;
 		}
 	} else {
@@ -378,7 +377,7 @@ SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
 	iov[0].iov_len = in_buf->smb_buf_length + 4;
 	flags |= CIFS_NO_RESP;
 	rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags);
-	cFYI(DBG2, ("SendRcvNoRsp flags %d rc %d", flags, rc));
+	cFYI(DBG2, "SendRcvNoRsp flags %d rc %d", flags, rc);

 	return rc;
 }
@@ -401,7 +400,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,

 	if ((ses == NULL) || (ses->server == NULL)) {
 		cifs_small_buf_release(in_buf);
-		cERROR(1, ("Null session"));
+		cERROR(1, "Null session");
 		return -EIO;
 	}

@@ -470,7 +469,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 	else if (long_op == CIFS_BLOCKING_OP)
 		timeout = 0x7FFFFFFF; /* large, but not so large as to wrap */
 	else {
-		cERROR(1, ("unknown timeout flag %d", long_op));
+		cERROR(1, "unknown timeout flag %d", long_op);
 		rc = -EIO;
 		goto out;
 	}
@@ -489,8 +488,8 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 	spin_lock(&GlobalMid_Lock);

 	if (midQ->resp_buf == NULL) {
-		cERROR(1, ("No response to cmd %d mid %d",
-			midQ->command, midQ->mid));
+		cERROR(1, "No response to cmd %d mid %d",
+			midQ->command, midQ->mid);
 		if (midQ->midState == MID_REQUEST_SUBMITTED) {
 			if (ses->server->tcpStatus == CifsExiting)
 				rc = -EHOSTDOWN;
@@ -503,7 +502,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 		if (rc != -EHOSTDOWN) {
 			if (midQ->midState == MID_RETRY_NEEDED) {
 				rc = -EAGAIN;
-				cFYI(1, ("marking request for retry"));
+				cFYI(1, "marking request for retry");
 			} else {
 				rc = -EIO;
 			}
@@ -520,8 +519,8 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 	receive_len = midQ->resp_buf->smb_buf_length;

 	if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
-		cERROR(1, ("Frame too large received. Length: %d Xid: %d",
-			receive_len, xid));
+		cERROR(1, "Frame too large received. Length: %d Xid: %d",
+			receive_len, xid);
 		rc = -EIO;
 		goto out;
 	}
@@ -547,7 +546,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 					     &ses->server->mac_signing_key,
 					     midQ->sequence_number+1);
 		if (rc) {
-			cERROR(1, ("Unexpected SMB signature"));
+			cERROR(1, "Unexpected SMB signature");
 			/* BB FIXME add code to kill session */
 		}
 	}
@@ -568,7 +567,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 		    DeleteMidQEntry */
 	} else {
 		rc = -EIO;
-		cFYI(1, ("Bad MID state?"));
+		cFYI(1, "Bad MID state?");
 	}

 out:
@@ -590,11 +589,11 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 	struct mid_q_entry *midQ;

 	if (ses == NULL) {
-		cERROR(1, ("Null smb session"));
+		cERROR(1, "Null smb session");
 		return -EIO;
 	}
 	if (ses->server == NULL) {
-		cERROR(1, ("Null tcp session"));
+		cERROR(1, "Null tcp session");
 		return -EIO;
 	}

@@ -606,8 +605,8 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 	   use ses->maxReq */

 	if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
-		cERROR(1, ("Illegal length, greater than maximum frame, %d",
-			in_buf->smb_buf_length));
+		cERROR(1, "Illegal length, greater than maximum frame, %d",
+			in_buf->smb_buf_length);
 		return -EIO;
 	}

@@ -664,7 +663,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 	else if (long_op == CIFS_BLOCKING_OP)
 		timeout = 0x7FFFFFFF; /* large but no so large as to wrap */
 	else {
-		cERROR(1, ("unknown timeout flag %d", long_op));
+		cERROR(1, "unknown timeout flag %d", long_op);
 		rc = -EIO;
 		goto out;
 	}
@@ -680,8 +679,8 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,

 	spin_lock(&GlobalMid_Lock);
 	if (midQ->resp_buf == NULL) {
-		cERROR(1, ("No response for cmd %d mid %d",
-			midQ->command, midQ->mid));
+		cERROR(1, "No response for cmd %d mid %d",
+			midQ->command, midQ->mid);
 		if (midQ->midState == MID_REQUEST_SUBMITTED) {
 			if (ses->server->tcpStatus == CifsExiting)
 				rc = -EHOSTDOWN;
@@ -694,7 +693,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 		if (rc != -EHOSTDOWN) {
 			if (midQ->midState == MID_RETRY_NEEDED) {
 				rc = -EAGAIN;
-				cFYI(1, ("marking request for retry"));
+				cFYI(1, "marking request for retry");
 			} else {
 				rc = -EIO;
 			}
@@ -711,8 +710,8 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 	receive_len = midQ->resp_buf->smb_buf_length;

 	if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
-		cERROR(1, ("Frame too large received. Length: %d Xid: %d",
-			receive_len, xid));
+		cERROR(1, "Frame too large received. Length: %d Xid: %d",
+			receive_len, xid);
 		rc = -EIO;
 		goto out;
 	}
@@ -735,7 +734,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 					   &ses->server->mac_signing_key,
 					   midQ->sequence_number+1);
 		if (rc) {
-			cERROR(1, ("Unexpected SMB signature"));
+			cERROR(1, "Unexpected SMB signature");
 			/* BB FIXME add code to kill session */
 		}
 	}
@@ -752,7 +751,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 		BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf));
 	} else {
 		rc = -EIO;
-		cERROR(1, ("Bad MID state?"));
+		cERROR(1, "Bad MID state?");
 	}

 out:
@@ -823,13 +822,13 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 	struct cifsSesInfo *ses;

 	if (tcon == NULL || tcon->ses == NULL) {
-		cERROR(1, ("Null smb session"));
+		cERROR(1, "Null smb session");
 		return -EIO;
 	}
 	ses = tcon->ses;

 	if (ses->server == NULL) {
-		cERROR(1, ("Null tcp session"));
+		cERROR(1, "Null tcp session");
 		return -EIO;
 	}

@@ -841,8 +840,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 	   use ses->maxReq */

 	if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
-		cERROR(1, ("Illegal length, greater than maximum frame, %d",
-			in_buf->smb_buf_length));
+		cERROR(1, "Illegal length, greater than maximum frame, %d",
+			in_buf->smb_buf_length);
 		return -EIO;
 	}

@@ -932,8 +931,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 		spin_unlock(&GlobalMid_Lock);
 		receive_len = midQ->resp_buf->smb_buf_length;
 	} else {
-		cERROR(1, ("No response for cmd %d mid %d",
-			midQ->command, midQ->mid));
+		cERROR(1, "No response for cmd %d mid %d",
+			midQ->command, midQ->mid);
 		if (midQ->midState == MID_REQUEST_SUBMITTED) {
 			if (ses->server->tcpStatus == CifsExiting)
 				rc = -EHOSTDOWN;
@@ -946,7 +945,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 		if (rc != -EHOSTDOWN) {
 			if (midQ->midState == MID_RETRY_NEEDED) {
 				rc = -EAGAIN;
-				cFYI(1, ("marking request for retry"));
+				cFYI(1, "marking request for retry");
 			} else {
 				rc = -EIO;
 			}
@@ -957,8 +956,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 	}

 	if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
-		cERROR(1, ("Frame too large received. Length: %d Xid: %d",
-			receive_len, xid));
+		cERROR(1, "Frame too large received. Length: %d Xid: %d",
+			receive_len, xid);
 		rc = -EIO;
 		goto out;
 	}
@@ -967,7 +966,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,

 	if ((out_buf == NULL) || (midQ->midState != MID_RESPONSE_RECEIVED)) {
 		rc = -EIO;
-		cERROR(1, ("Bad MID state?"));
+		cERROR(1, "Bad MID state?");
 		goto out;
 	}

@@ -985,7 +984,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 					   &ses->server->mac_signing_key,
 					   midQ->sequence_number+1);
 		if (rc) {
-			cERROR(1, ("Unexpected SMB signature"));
+			cERROR(1, "Unexpected SMB signature");
 			/* BB FIXME add code to kill session */
 		}
 	}
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 3e2ef0de1209..a1509207bfa6 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -21,6 +21,7 @@

 #include <linux/fs.h>
 #include <linux/posix_acl_xattr.h>
+#include <linux/slab.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
 #include "cifsglob.h"
@@ -69,12 +70,12 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
 		return rc;
 	}
 	if (ea_name == NULL) {
-		cFYI(1, ("Null xattr names not supported"));
+		cFYI(1, "Null xattr names not supported");
 	} else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5)
 		&& (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4))) {
 		cFYI(1,
-		     ("illegal xattr request %s (only user namespace supported)",
-		     ea_name));
+		     "illegal xattr request %s (only user namespace supported)",
+		     ea_name);
 		/* BB what if no namespace prefix? */
 		/* Should we just pass them to server, except for
 		   system and perhaps security prefixes? */
@@ -130,19 +131,19 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 	   search server for EAs or streams to
 	   returns as xattrs */
 	if (value_size > MAX_EA_VALUE_SIZE) {
-		cFYI(1, ("size of EA value too large"));
+		cFYI(1, "size of EA value too large");
 		kfree(full_path);
 		FreeXid(xid);
 		return -EOPNOTSUPP;
 	}

 	if (ea_name == NULL) {
-		cFYI(1, ("Null xattr names not supported"));
+		cFYI(1, "Null xattr names not supported");
 	} else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) {
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 			goto set_ea_exit;
 		if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0)
-			cFYI(1, ("attempt to set cifs inode metadata"));
+			cFYI(1, "attempt to set cifs inode metadata");

 		ea_name += 5; /* skip past user. prefix */
 		rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value,
@@ -168,9 +169,9 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 				ACL_TYPE_ACCESS, cifs_sb->local_nls,
 				cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
-			cFYI(1, ("set POSIX ACL rc %d", rc));
+			cFYI(1, "set POSIX ACL rc %d", rc);
 #else
-			cFYI(1, ("set POSIX ACL not supported"));
+			cFYI(1, "set POSIX ACL not supported");
 #endif
 		} else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
 				   strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -181,13 +182,13 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 				ACL_TYPE_DEFAULT, cifs_sb->local_nls,
 				cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
-			cFYI(1, ("set POSIX default ACL rc %d", rc));
+			cFYI(1, "set POSIX default ACL rc %d", rc);
 #else
-			cFYI(1, ("set default POSIX ACL not supported"));
+			cFYI(1, "set default POSIX ACL not supported");
 #endif
 		} else {
-			cFYI(1, ("illegal xattr request %s (only user namespace"
-				" supported)", ea_name));
+			cFYI(1, "illegal xattr request %s (only user namespace"
+				" supported)", ea_name);
 		/* BB what if no namespace prefix? */
 		/* Should we just pass them to server, except for
 		   system and perhaps security prefixes? */
@@ -234,13 +235,13 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 	/* return dos attributes as pseudo xattr */
 	/* return alt name if available as pseudo attr */
 	if (ea_name == NULL) {
-		cFYI(1, ("Null xattr names not supported"));
+		cFYI(1, "Null xattr names not supported");
 	} else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) {
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 			goto get_ea_exit;

 		if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) {
-			cFYI(1, ("attempt to query cifs inode metadata"));
+			cFYI(1, "attempt to query cifs inode metadata");
 			/* revalidate/getattr then populate from inode */
 		} /* BB add else when above is implemented */
 		ea_name += 5; /* skip past user. prefix */
@@ -286,7 +287,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 		}
 #endif /* EXPERIMENTAL */
 #else
-		cFYI(1, ("query POSIX ACL not supported yet"));
+		cFYI(1, "query POSIX ACL not supported yet");
 #endif /* CONFIG_CIFS_POSIX */
 	} else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
 			  strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -298,18 +299,18 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 				cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 #else
-		cFYI(1, ("query POSIX default ACL not supported yet"));
+		cFYI(1, "query POSIX default ACL not supported yet");
 #endif
 	} else if (strncmp(ea_name,
 		  CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
-		cFYI(1, ("Trusted xattr namespace not supported yet"));
+		cFYI(1, "Trusted xattr namespace not supported yet");
 	} else if (strncmp(ea_name,
 		  CIFS_XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) {
-		cFYI(1, ("Security xattr namespace not supported yet"));
+		cFYI(1, "Security xattr namespace not supported yet");
 	} else
 		cFYI(1,
-		    ("illegal xattr request %s (only user namespace supported)",
-		    ea_name));
+		    "illegal xattr request %s (only user namespace supported)",
+		    ea_name);

 	/* We could add an additional check for streams ie
 	   if proc/fs/cifs/streamstoxattr is set then
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 4bb9d0a5decc..ccd98b0f2b0b 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -12,6 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/time.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/file.h>
 #include <linux/stat.h>
 #include <linux/errno.h>
diff --git a/fs/coda/file.c b/fs/coda/file.c
index ffd42815fda1..4c813f2cdc52 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -17,6 +17,7 @@
 #include <linux/errno.h>
 #include <linux/smp_lock.h>
 #include <linux/string.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>

 #include <linux/coda.h>
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 830f51abb971..d97f9935a028 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -18,6 +18,7 @@
 #include <linux/smp_lock.h>
 #include <linux/file.h>
 #include <linux/vfs.h>
+#include <linux/slab.h>

 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -166,6 +167,10 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 		return -EBUSY;
 	}

+	error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY);
+	if (error)
+		goto bdi_err;
+
 	vc->vc_sb = sb;

 	sb->s_fs_info = vc;
@@ -174,6 +179,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_blocksize_bits = 12;
 	sb->s_magic = CODA_SUPER_MAGIC;
 	sb->s_op = &coda_super_operations;
+	sb->s_bdi = &vc->bdi;

 	/* get root fid from Venus: this needs the root inode */
 	error = venus_rootfid(sb, &fid);
@@ -199,6 +205,8 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;

 error:
+	bdi_destroy(&vc->bdi);
+bdi_err:
 	if (root)
 		iput(root);
 	if (vc)
@@ -209,6 +217,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)

 static void coda_put_super(struct super_block *sb)
 {
+	bdi_destroy(&coda_vcp(sb)->bdi);
 	coda_vcp(sb)->vc_sb = NULL;
 	sb->s_fs_info = NULL;

diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index c274d949179d..f09c5ed76f6c 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -26,6 +26,7 @@
 #include <linux/stat.h>
 #include <linux/errno.h>
 #include <linux/string.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <linux/vmalloc.h>
 #include <linux/vfs.h>
diff --git a/fs/compat.c b/fs/compat.c
index 00d90c2e66f0..05448730f840 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -49,6 +49,7 @@
 #include <linux/mm.h>
 #include <linux/eventpoll.h>
 #include <linux/fs_struct.h>
+#include <linux/slab.h>

 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -1530,8 +1531,6 @@ int compat_do_execve(char * filename,
 	if (retval < 0)
 		goto out;

-	current->stack_start = current->mm->start_stack;
-
 	/* execve succeeded */
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
@@ -1795,6 +1794,24 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
 	return ret;
 }

+struct compat_sel_arg_struct {
+	compat_ulong_t n;
+	compat_uptr_t inp;
+	compat_uptr_t outp;
+	compat_uptr_t exp;
+	compat_uptr_t tvp;
+};
+
+asmlinkage long compat_sys_old_select(struct compat_sel_arg_struct __user *arg)
+{
+	struct compat_sel_arg_struct a;
+
+	if (copy_from_user(&a, arg, sizeof(a)))
+		return -EFAULT;
+	return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
+				 compat_ptr(a.exp), compat_ptr(a.tvp));
+}
+
 #ifdef HAVE_SET_RESTORE_SIGMASK
 static long do_compat_pselect(int n, compat_ulong_t __user *inp,
 	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 6d55b61bfa79..641640dc7ae5 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -23,7 +23,6 @@
 #include <linux/ioctl.h>
 #include <linux/if.h>
 #include <linux/if_bridge.h>
-#include <linux/slab.h>
 #include <linux/raid/md_u.h>
 #include <linux/kd.h>
 #include <linux/route.h>
@@ -60,6 +59,7 @@
 #include <linux/i2c.h>
 #include <linux/i2c-dev.h>
 #include <linux/atalk.h>
+#include <linux/gfp.h>

 #include <net/bluetooth/bluetooth.h>
 #include <net/bluetooth/hci.h>
@@ -102,7 +102,6 @@
 #include <linux/nbd.h>
 #include <linux/random.h>
 #include <linux/filter.h>
-#include <linux/pktcdvd.h>

 #include <linux/hiddev.h>

@@ -1126,8 +1125,6 @@ COMPATIBLE_IOCTL(PPGETMODE)
 COMPATIBLE_IOCTL(PPGETPHASE)
 COMPATIBLE_IOCTL(PPGETFLAGS)
 COMPATIBLE_IOCTL(PPSETFLAGS)
-/* pktcdvd */
-COMPATIBLE_IOCTL(PACKET_CTRL_CMD)
 /* Big A */
 /* sparc only */
 /* Big Q for sound/OSS */
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 8e48b52205aa..0b502f80c691 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -645,6 +645,7 @@ static void detach_groups(struct config_group *group)

 		configfs_detach_group(sd->s_element);
 		child->d_inode->i_flags |= S_DEAD;
+		dont_mount(child);

 		mutex_unlock(&child->d_inode->i_mutex);

@@ -840,6 +841,7 @@ static int configfs_attach_item(struct config_item *parent_item,
 		mutex_lock(&dentry->d_inode->i_mutex);
 		configfs_remove_dir(item);
 		dentry->d_inode->i_flags |= S_DEAD;
+		dont_mount(dentry);
 		mutex_unlock(&dentry->d_inode->i_mutex);
 		d_delete(dentry);
 	}
@@ -882,6 +884,7 @@ static int configfs_attach_group(struct config_item *parent_item,
 	if (ret) {
 		configfs_detach_item(item);
 		dentry->d_inode->i_flags |= S_DEAD;
+		dont_mount(dentry);
 	}
 	configfs_adjust_dir_dirent_depth_after_populate(sd);
 	mutex_unlock(&dentry->d_inode->i_mutex);
@@ -1725,6 +1728,7 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
 	mutex_unlock(&configfs_symlink_mutex);
 	configfs_detach_group(&group->cg_item);
 	dentry->d_inode->i_flags |= S_DEAD;
+	dont_mount(dentry);
 	mutex_unlock(&dentry->d_inode->i_mutex);

 	d_delete(dentry);
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index a2f746066c5d..c8af2d91174b 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -34,6 +34,7 @@
 #include <linux/capability.h>
 #include <linux/sched.h>
 #include <linux/lockdep.h>
+#include <linux/slab.h>

 #include <linux/configfs.h>
 #include "configfs_internal.h"
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 8421cea7d8c7..8c8d64230c2d 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -29,6 +29,7 @@
 #include <linux/mount.h>
 #include <linux/pagemap.h>
 #include <linux/init.h>
+#include <linux/slab.h>

 #include <linux/configfs.h>
 #include "configfs_internal.h"
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 32a5f46b1157..0f3eb41d9201 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -27,6 +27,7 @@
 #include <linux/fs.h>
 #include <linux/module.h>
 #include <linux/namei.h>
+#include <linux/slab.h>

 #include <linux/configfs.h>
 #include "configfs_internal.h"
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 049d6c36da09..30a87b3dbcac 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -27,6 +27,7 @@
 #include <linux/fsnotify.h>
 #include <linux/string.h>
 #include <linux/magic.h>
+#include <linux/slab.h>

 static struct vfsmount *debugfs_mount;
 static int debugfs_mount_count;
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 8882ecc0f1bf..0120247b41c0 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -15,6 +15,7 @@
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/tty.h>
 #include <linux/mutex.h>
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 0df243850818..b54bca03d92f 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -14,6 +14,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/configfs.h>
+#include <linux/slab.h>
 #include <linux/in.h>
 #include <linux/in6.h>
 #include <net/ipv6.h>
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 29d6139c35fc..c6cf25158746 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/ctype.h>
 #include <linux/debugfs.h>
+#include <linux/slab.h>

 #include "dlm_internal.h"
 #include "lock.h"
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 46ffd3eeaaf7..031dbe3a15ca 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -56,6 +56,7 @@
    L: receive_xxxx_reply()     <-  R: send_xxxx_reply()
 */
 #include <linux/types.h>
+#include <linux/slab.h>
 #include "dlm_internal.h"
 #include <linux/dlm_device.h>
 #include "memory.h"
@@ -732,10 +733,7 @@ static void lkb_add_ordered(struct list_head *new, struct list_head *head,
 		if (lkb->lkb_rqmode < mode)
 			break;

-	if (!lkb)
-		list_add_tail(new, head);
-	else
-		__list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
+	__list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
 }

 /* add/remove lkb to rsb's grant/convert/wait queue */
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 26a8bd40400a..f994a7dfda85 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -148,7 +148,7 @@ static void lockspace_kobj_release(struct kobject *k)
 	kfree(ls);
 }

-static struct sysfs_ops dlm_attr_ops = {
+static const struct sysfs_ops dlm_attr_ops = {
 	.show = dlm_attr_show,
 	.store = dlm_attr_store,
 };
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 52cab160893c..c0d35c620526 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -51,6 +51,7 @@
 #include <linux/file.h>
 #include <linux/mutex.h>
 #include <linux/sctp.h>
+#include <linux/slab.h>
 #include <net/sctp/user.h>
 #include <net/ipv6.h>

diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 84f70bfb0baf..b12532e553f8 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -312,7 +312,7 @@ int dlm_ls_stop(struct dlm_ls *ls)
 	/*
 	 * This in_recovery lock does two things:
 	 * 1) Keeps this function from returning until all threads are out
-	 *    of locking routines and locking is truely stopped.
+	 *    of locking routines and locking is truly stopped.
 	 * 2) Keeps any new requests from being processed until it's unlocked
 	 *    when recovery is complete.
 	 */
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 052095cd592f..2c6ad518100d 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -9,6 +9,7 @@
 #include <net/genetlink.h>
 #include <linux/dlm.h>
 #include <linux/dlm_netlink.h>
+#include <linux/gfp.h>

 #include "dlm_internal.h"

diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index b5f89aef3b29..d45c02db6943 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -11,6 +11,7 @@
 #include <linux/poll.h>
 #include <linux/dlm.h>
 #include <linux/dlm_plock.h>
+#include <linux/slab.h>

 #include "dlm_internal.h"
 #include "lockspace.h"
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index a4bfd31ac45b..b6272853130c 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -17,6 +17,7 @@
 #include <linux/spinlock.h>
 #include <linux/dlm.h>
 #include <linux/dlm_device.h>
+#include <linux/slab.h>

 #include "dlm_internal.h"
 #include "lockspace.h"
@@ -214,6 +215,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
 	if (!ast_type) {
 		kref_get(&lkb->lkb_ref);
 		list_add_tail(&lkb->lkb_astqueue, &proc->asts);
+		lkb->lkb_ast_first = type;
 		wake_up_interruptible(&proc->wait);
 	}
 	if (type == AST_COMP && (ast_type & AST_COMP))
@@ -222,7 +224,6 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)

 	eol = lkb_is_endoflife(lkb, ua->lksb.sb_status, type);
 	if (eol) {
-		lkb->lkb_ast_type &= ~AST_BAST;
 		lkb->lkb_flags |= DLM_IFL_ENDOFLIFE;
 	}

@@ -705,7 +706,7 @@ static int device_close(struct inode *inode, struct file *file)
 }

 static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
-			       int bmode, char __user *buf, size_t count)
+			       int mode, char __user *buf, size_t count)
 {
 #ifdef CONFIG_COMPAT
 	struct dlm_lock_result32 result32;
@@ -732,7 +733,7 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
 	if (type == AST_BAST) {
 		result.user_astaddr = ua->bastaddr;
 		result.user_astparam = ua->bastparam;
-		result.bast_mode = bmode;
+		result.bast_mode = mode;
 	} else {
 		result.user_astaddr = ua->castaddr;
 		result.user_astparam = ua->castparam;
@@ -800,7 +801,9 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
 	struct dlm_user_proc *proc = file->private_data;
 	struct dlm_lkb *lkb;
 	DECLARE_WAITQUEUE(wait, current);
-	int error, type=0, bmode=0, removed = 0;
+	int error = 0, removed;
+	int ret_type, ret_mode;
+	int bastmode, castmode, do_bast, do_cast;

 	if (count == sizeof(struct dlm_device_version)) {
 		error = copy_version_to_user(buf, count);
@@ -819,6 +822,8 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
 #endif
 		return -EINVAL;

+ try_another:
+
 	/* do we really need this? can a read happen after a close? */
 	if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))
 		return -EINVAL;
@@ -854,13 +859,55 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,

 	lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue);

-	if (lkb->lkb_ast_type & AST_COMP) {
-		lkb->lkb_ast_type &= ~AST_COMP;
-		type = AST_COMP;
-	} else if (lkb->lkb_ast_type & AST_BAST) {
-		lkb->lkb_ast_type &= ~AST_BAST;
-		type = AST_BAST;
-		bmode = lkb->lkb_bastmode;
+	removed = 0;
+	ret_type = 0;
+	ret_mode = 0;
+	do_bast = lkb->lkb_ast_type & AST_BAST;
+	do_cast = lkb->lkb_ast_type & AST_COMP;
+	bastmode = lkb->lkb_bastmode;
+	castmode = lkb->lkb_castmode;
+
+	/* when both are queued figure out which to do first and
+	   switch first so the other goes in the next read */
+
+	if (do_cast && do_bast) {
+		if (lkb->lkb_ast_first == AST_COMP) {
+			ret_type = AST_COMP;
+			ret_mode = castmode;
+			lkb->lkb_ast_type &= ~AST_COMP;
+			lkb->lkb_ast_first = AST_BAST;
+		} else {
+			ret_type = AST_BAST;
+			ret_mode = bastmode;
+			lkb->lkb_ast_type &= ~AST_BAST;
+			lkb->lkb_ast_first = AST_COMP;
+		}
+	} else {
+		ret_type = lkb->lkb_ast_first;
+		ret_mode = (ret_type == AST_COMP) ? castmode : bastmode;
+		lkb->lkb_ast_type &= ~ret_type;
+		lkb->lkb_ast_first = 0;
+	}
+
+	/* if we're doing a bast but the bast is unnecessary, then
+	   switch to do nothing or do a cast if that was needed next */
+
+	if ((ret_type == AST_BAST) &&
+	    dlm_modes_compat(bastmode, lkb->lkb_castmode_done)) {
+		ret_type = 0;
+		ret_mode = 0;
+
+		if (do_cast) {
+			ret_type = AST_COMP;
+			ret_mode = castmode;
+			lkb->lkb_ast_type &= ~AST_COMP;
+			lkb->lkb_ast_first = 0;
+		}
+	}
+
+	if (lkb->lkb_ast_first != lkb->lkb_ast_type) {
+		log_print("device_read %x ast_first %x ast_type %x",
+			  lkb->lkb_id, lkb->lkb_ast_first, lkb->lkb_ast_type);
 	}

 	if (!lkb->lkb_ast_type) {
@@ -869,15 +916,29 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
 	}
 	spin_unlock(&proc->asts_spin);

-	error = copy_result_to_user(lkb->lkb_ua,
-			test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
-			type, bmode, buf, count);
+	if (ret_type) {
+		error = copy_result_to_user(lkb->lkb_ua,
+				test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
+				ret_type, ret_mode, buf, count);
+
+		if (ret_type == AST_COMP)
+			lkb->lkb_castmode_done = castmode;
+		if (ret_type == AST_BAST)
+			lkb->lkb_bastmode_done = bastmode;
+	}

 	/* removes reference for the proc->asts lists added by
 	   dlm_user_add_ast() and may result in the lkb being freed */
+
 	if (removed)
 		dlm_put_lkb(lkb);

+	/* the bast that was queued was eliminated (see unnecessary above),
+	   leaving nothing to return */
+
+	if (!ret_type)
+		goto try_another;
+
 	return error;
 }

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 7cb0a59f4b9d..1cc087635a5e 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -33,6 +33,7 @@
 #include <linux/crypto.h>
 #include <linux/file.h>
 #include <linux/scatterlist.h>
+#include <linux/slab.h>
 #include <asm/unaligned.h>
 #include "ecryptfs_kernel.h"

@@ -381,8 +382,8 @@ out:
 static void ecryptfs_lower_offset_for_extent(loff_t *offset, loff_t extent_num,
 					     struct ecryptfs_crypt_stat *crypt_stat)
 {
-	(*offset) = (crypt_stat->num_header_bytes_at_front
-		     + (crypt_stat->extent_size * extent_num));
+	(*offset) = ecryptfs_lower_header_size(crypt_stat)
+		    + (crypt_stat->extent_size * extent_num);
 }

 /**
@@ -834,13 +835,13 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat)
 	set_extent_mask_and_shift(crypt_stat);
 	crypt_stat->iv_bytes = ECRYPTFS_DEFAULT_IV_BYTES;
 	if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
-		crypt_stat->num_header_bytes_at_front = 0;
+		crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
 	else {
 		if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)
-			crypt_stat->num_header_bytes_at_front =
+			crypt_stat->metadata_size =
 				ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
 		else
-			crypt_stat->num_header_bytes_at_front = PAGE_CACHE_SIZE;
+			crypt_stat->metadata_size = PAGE_CACHE_SIZE;
 	}
 }
846 847
@@ -1107,9 +1108,9 @@ static void write_ecryptfs_marker(char *page_virt, size_t *written)
1107 (*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES; 1108 (*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
1108} 1109}
1109 1110
1110static void 1111void ecryptfs_write_crypt_stat_flags(char *page_virt,
1111write_ecryptfs_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat, 1112 struct ecryptfs_crypt_stat *crypt_stat,
1112 size_t *written) 1113 size_t *written)
1113{ 1114{
1114 u32 flags = 0; 1115 u32 flags = 0;
1115 int i; 1116 int i;
@@ -1237,8 +1238,7 @@ ecryptfs_write_header_metadata(char *virt,

 	header_extent_size = (u32)crypt_stat->extent_size;
 	num_header_extents_at_front =
-		(u16)(crypt_stat->num_header_bytes_at_front
-		      / crypt_stat->extent_size);
+		(u16)(crypt_stat->metadata_size / crypt_stat->extent_size);
 	put_unaligned_be32(header_extent_size, virt);
 	virt += 4;
 	put_unaligned_be16(num_header_extents_at_front, virt);
@@ -1291,7 +1291,8 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max,
 	offset = ECRYPTFS_FILE_SIZE_BYTES;
 	write_ecryptfs_marker((page_virt + offset), &written);
 	offset += written;
-	write_ecryptfs_flags((page_virt + offset), crypt_stat, &written);
+	ecryptfs_write_crypt_stat_flags((page_virt + offset), crypt_stat,
+					&written);
 	offset += written;
 	ecryptfs_write_header_metadata((page_virt + offset), crypt_stat,
 				       &written);
@@ -1381,7 +1382,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
1381 rc = -EINVAL; 1382 rc = -EINVAL;
1382 goto out; 1383 goto out;
1383 } 1384 }
1384 virt_len = crypt_stat->num_header_bytes_at_front; 1385 virt_len = crypt_stat->metadata_size;
1385 order = get_order(virt_len); 1386 order = get_order(virt_len);
1386 /* Released in this function */ 1387 /* Released in this function */
1387 virt = (char *)ecryptfs_get_zeroed_pages(GFP_KERNEL, order); 1388 virt = (char *)ecryptfs_get_zeroed_pages(GFP_KERNEL, order);
@@ -1427,16 +1428,15 @@ static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat,
1427 header_extent_size = get_unaligned_be32(virt); 1428 header_extent_size = get_unaligned_be32(virt);
1428 virt += sizeof(__be32); 1429 virt += sizeof(__be32);
1429 num_header_extents_at_front = get_unaligned_be16(virt); 1430 num_header_extents_at_front = get_unaligned_be16(virt);
1430 crypt_stat->num_header_bytes_at_front = 1431 crypt_stat->metadata_size = (((size_t)num_header_extents_at_front
1431 (((size_t)num_header_extents_at_front 1432 * (size_t)header_extent_size));
1432 * (size_t)header_extent_size));
1433 (*bytes_read) = (sizeof(__be32) + sizeof(__be16)); 1433 (*bytes_read) = (sizeof(__be32) + sizeof(__be16));
1434 if ((validate_header_size == ECRYPTFS_VALIDATE_HEADER_SIZE) 1434 if ((validate_header_size == ECRYPTFS_VALIDATE_HEADER_SIZE)
1435 && (crypt_stat->num_header_bytes_at_front 1435 && (crypt_stat->metadata_size
1436 < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)) { 1436 < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)) {
1437 rc = -EINVAL; 1437 rc = -EINVAL;
1438 printk(KERN_WARNING "Invalid header size: [%zd]\n", 1438 printk(KERN_WARNING "Invalid header size: [%zd]\n",
1439 crypt_stat->num_header_bytes_at_front); 1439 crypt_stat->metadata_size);
1440 } 1440 }
1441 return rc; 1441 return rc;
1442} 1442}
@@ -1451,8 +1451,7 @@ static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat,
1451 */ 1451 */
1452static void set_default_header_data(struct ecryptfs_crypt_stat *crypt_stat) 1452static void set_default_header_data(struct ecryptfs_crypt_stat *crypt_stat)
1453{ 1453{
1454 crypt_stat->num_header_bytes_at_front = 1454 crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
1455 ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
1456} 1455}
1457 1456
1458/** 1457/**
@@ -1606,6 +1605,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
1606 ecryptfs_dentry, 1605 ecryptfs_dentry,
1607 ECRYPTFS_VALIDATE_HEADER_SIZE); 1606 ECRYPTFS_VALIDATE_HEADER_SIZE);
1608 if (rc) { 1607 if (rc) {
1608 memset(page_virt, 0, PAGE_CACHE_SIZE);
1609 rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode); 1609 rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode);
1610 if (rc) { 1610 if (rc) {
1611 printk(KERN_DEBUG "Valid eCryptfs headers not found in " 1611 printk(KERN_DEBUG "Valid eCryptfs headers not found in "
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 8f006a0d6076..906e803f7f79 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -26,6 +26,7 @@
26#include <linux/namei.h> 26#include <linux/namei.h>
27#include <linux/mount.h> 27#include <linux/mount.h>
28#include <linux/fs_stack.h> 28#include <linux/fs_stack.h>
29#include <linux/slab.h>
29#include "ecryptfs_kernel.h" 30#include "ecryptfs_kernel.h"
30 31
31/** 32/**
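The single #include <linux/slab.h> added here recurs throughout this merge (file.c, inode.c, keystore.c, kthread.c, main.c, messaging.c, miscdev.c, mmap.c and super.c below, plus the eventfd, exofs, ext2/3/4 and fat files): it is the tree-wide preparation for breaking the implicit inclusion of slab.h via percpu.h. From this point on, any file that calls the allocator must include the header itself; the rule these one-line hunks enforce, as a minimal sketch:

	#include <linux/slab.h>	/* kmalloc, kzalloc, kfree, kmem_cache_* */

	static void *example_alloc(size_t n)
	{
		/* would no longer compile without the explicit include */
		return kmalloc(n, GFP_KERNEL);
	}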
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 542f625312f3..bfc2e0f78f00 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -35,6 +35,7 @@
35#include <linux/scatterlist.h> 35#include <linux/scatterlist.h>
36#include <linux/hash.h> 36#include <linux/hash.h>
37#include <linux/nsproxy.h> 37#include <linux/nsproxy.h>
38#include <linux/backing-dev.h>
38 39
39/* Version verification for shared data structures w/ userspace */ 40/* Version verification for shared data structures w/ userspace */
40#define ECRYPTFS_VERSION_MAJOR 0x00 41#define ECRYPTFS_VERSION_MAJOR 0x00
@@ -273,7 +274,7 @@ struct ecryptfs_crypt_stat {
273 u32 flags; 274 u32 flags;
274 unsigned int file_version; 275 unsigned int file_version;
275 size_t iv_bytes; 276 size_t iv_bytes;
276 size_t num_header_bytes_at_front; 277 size_t metadata_size;
277 size_t extent_size; /* Data extent size; default is 4096 */ 278 size_t extent_size; /* Data extent size; default is 4096 */
278 size_t key_size; 279 size_t key_size;
279 size_t extent_shift; 280 size_t extent_shift;
@@ -393,6 +394,7 @@ struct ecryptfs_mount_crypt_stat {
393struct ecryptfs_sb_info { 394struct ecryptfs_sb_info {
394 struct super_block *wsi_sb; 395 struct super_block *wsi_sb;
395 struct ecryptfs_mount_crypt_stat mount_crypt_stat; 396 struct ecryptfs_mount_crypt_stat mount_crypt_stat;
397 struct backing_dev_info bdi;
396}; 398};
397 399
398/* file private data. */ 400/* file private data. */
@@ -464,6 +466,14 @@ struct ecryptfs_daemon {
464 466
465extern struct mutex ecryptfs_daemon_hash_mux; 467extern struct mutex ecryptfs_daemon_hash_mux;
466 468
469static inline size_t
470ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat)
471{
472 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
473 return 0;
474 return crypt_stat->metadata_size;
475}
476
467static inline struct ecryptfs_file_info * 477static inline struct ecryptfs_file_info *
468ecryptfs_file_to_private(struct file *file) 478ecryptfs_file_to_private(struct file *file)
469{ 479{
@@ -651,6 +661,9 @@ int ecryptfs_decrypt_page(struct page *page);
651int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry); 661int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry);
652int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry); 662int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry);
653int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry); 663int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry);
664void ecryptfs_write_crypt_stat_flags(char *page_virt,
665 struct ecryptfs_crypt_stat *crypt_stat,
666 size_t *written);
654int ecryptfs_read_and_validate_header_region(char *data, 667int ecryptfs_read_and_validate_header_region(char *data,
655 struct inode *ecryptfs_inode); 668 struct inode *ecryptfs_inode);
656int ecryptfs_read_and_validate_xattr_region(char *page_virt, 669int ecryptfs_read_and_validate_xattr_region(char *page_virt,
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 678172b61be2..e7440a6f5ebf 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -25,6 +25,7 @@
25 25
26#include <linux/file.h> 26#include <linux/file.h>
27#include <linux/poll.h> 27#include <linux/poll.h>
28#include <linux/slab.h>
28#include <linux/mount.h> 29#include <linux/mount.h>
29#include <linux/pagemap.h> 30#include <linux/pagemap.h>
30#include <linux/security.h> 31#include <linux/security.h>
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 4a430ab4115c..e2d4418affac 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -31,6 +31,7 @@
31#include <linux/mount.h> 31#include <linux/mount.h>
32#include <linux/crypto.h> 32#include <linux/crypto.h>
33#include <linux/fs_stack.h> 33#include <linux/fs_stack.h>
34#include <linux/slab.h>
34#include <asm/unaligned.h> 35#include <asm/unaligned.h>
35#include "ecryptfs_kernel.h" 36#include "ecryptfs_kernel.h"
36 37
@@ -323,6 +324,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
323 rc = ecryptfs_read_and_validate_header_region(page_virt, 324 rc = ecryptfs_read_and_validate_header_region(page_virt,
324 ecryptfs_dentry->d_inode); 325 ecryptfs_dentry->d_inode);
325 if (rc) { 326 if (rc) {
327 memset(page_virt, 0, PAGE_CACHE_SIZE);
326 rc = ecryptfs_read_and_validate_xattr_region(page_virt, 328 rc = ecryptfs_read_and_validate_xattr_region(page_virt,
327 ecryptfs_dentry); 329 ecryptfs_dentry);
328 if (rc) { 330 if (rc) {
@@ -335,7 +337,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
335 ecryptfs_dentry->d_sb)->mount_crypt_stat; 337 ecryptfs_dentry->d_sb)->mount_crypt_stat;
336 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) { 338 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) {
337 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) 339 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
338 file_size = (crypt_stat->num_header_bytes_at_front 340 file_size = (crypt_stat->metadata_size
339 + i_size_read(lower_dentry->d_inode)); 341 + i_size_read(lower_dentry->d_inode));
340 else 342 else
341 file_size = i_size_read(lower_dentry->d_inode); 343 file_size = i_size_read(lower_dentry->d_inode);
@@ -387,9 +389,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
387 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); 389 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
388 if (IS_ERR(lower_dentry)) { 390 if (IS_ERR(lower_dentry)) {
389 rc = PTR_ERR(lower_dentry); 391 rc = PTR_ERR(lower_dentry);
390 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " 392 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
391 "lower_dentry = [%s]\n", __func__, rc, 393 "[%d] on lower_dentry = [%s]\n", __func__, rc,
392 ecryptfs_dentry->d_name.name); 394 encrypted_and_encoded_name);
393 goto out_d_drop; 395 goto out_d_drop;
394 } 396 }
395 if (lower_dentry->d_inode) 397 if (lower_dentry->d_inode)
@@ -416,9 +418,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
416 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); 418 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
417 if (IS_ERR(lower_dentry)) { 419 if (IS_ERR(lower_dentry)) {
418 rc = PTR_ERR(lower_dentry); 420 rc = PTR_ERR(lower_dentry);
419 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " 421 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
420 "lower_dentry = [%s]\n", __func__, rc, 422 "[%d] on lower_dentry = [%s]\n", __func__, rc,
421 encrypted_and_encoded_name); 423 encrypted_and_encoded_name);
422 goto out_d_drop; 424 goto out_d_drop;
423 } 425 }
424lookup_and_interpose: 426lookup_and_interpose:
@@ -455,8 +457,8 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
455 rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0); 457 rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0);
456 if (rc) 458 if (rc)
457 goto out_lock; 459 goto out_lock;
458 fsstack_copy_attr_times(dir, lower_new_dentry->d_inode); 460 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
459 fsstack_copy_inode_size(dir, lower_new_dentry->d_inode); 461 fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode);
460 old_dentry->d_inode->i_nlink = 462 old_dentry->d_inode->i_nlink =
461 ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink; 463 ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink;
462 i_size_write(new_dentry->d_inode, file_size_save); 464 i_size_write(new_dentry->d_inode, file_size_save);
@@ -647,38 +649,17 @@ out_lock:
647 return rc; 649 return rc;
648} 650}
649 651
650static int 652static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf,
651ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) 653 size_t *bufsiz)
652{ 654{
655 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
653 char *lower_buf; 656 char *lower_buf;
654 size_t lower_bufsiz; 657 size_t lower_bufsiz = PATH_MAX;
655 struct dentry *lower_dentry;
656 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
657 char *plaintext_name;
658 size_t plaintext_name_size;
659 mm_segment_t old_fs; 658 mm_segment_t old_fs;
660 int rc; 659 int rc;
661 660
662 lower_dentry = ecryptfs_dentry_to_lower(dentry);
663 if (!lower_dentry->d_inode->i_op->readlink) {
664 rc = -EINVAL;
665 goto out;
666 }
667 mount_crypt_stat = &ecryptfs_superblock_to_private(
668 dentry->d_sb)->mount_crypt_stat;
669 /*
670 * If the lower filename is encrypted, it will result in a significantly
671 * longer name. If needed, truncate the name after decode and decrypt.
672 */
673 if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
674 lower_bufsiz = PATH_MAX;
675 else
676 lower_bufsiz = bufsiz;
677 /* Released in this function */
678 lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL); 661 lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL);
679 if (lower_buf == NULL) { 662 if (!lower_buf) {
680 printk(KERN_ERR "%s: Out of memory whilst attempting to "
681 "kmalloc [%zd] bytes\n", __func__, lower_bufsiz);
682 rc = -ENOMEM; 663 rc = -ENOMEM;
683 goto out; 664 goto out;
684 } 665 }
@@ -688,29 +669,31 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
688 (char __user *)lower_buf, 669 (char __user *)lower_buf,
689 lower_bufsiz); 670 lower_bufsiz);
690 set_fs(old_fs); 671 set_fs(old_fs);
691 if (rc >= 0) { 672 if (rc < 0)
692 rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name, 673 goto out;
693 &plaintext_name_size, 674 lower_bufsiz = rc;
694 dentry, lower_buf, 675 rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry,
695 rc); 676 lower_buf, lower_bufsiz);
696 if (rc) { 677out:
697 printk(KERN_ERR "%s: Error attempting to decode and "
698 "decrypt filename; rc = [%d]\n", __func__,
699 rc);
700 goto out_free_lower_buf;
701 }
702 /* Check for bufsiz <= 0 done in sys_readlinkat() */
703 rc = copy_to_user(buf, plaintext_name,
704 min((size_t) bufsiz, plaintext_name_size));
705 if (rc)
706 rc = -EFAULT;
707 else
708 rc = plaintext_name_size;
709 kfree(plaintext_name);
710 fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode);
711 }
712out_free_lower_buf:
713 kfree(lower_buf); 678 kfree(lower_buf);
679 return rc;
680}
681
682static int
683ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
684{
685 char *kbuf;
686 size_t kbufsiz, copied;
687 int rc;
688
689 rc = ecryptfs_readlink_lower(dentry, &kbuf, &kbufsiz);
690 if (rc)
691 goto out;
692 copied = min_t(size_t, bufsiz, kbufsiz);
693 rc = copy_to_user(buf, kbuf, copied) ? -EFAULT : copied;
694 kfree(kbuf);
695 fsstack_copy_attr_atime(dentry->d_inode,
696 ecryptfs_dentry_to_lower(dentry)->d_inode);
714out: 697out:
715 return rc; 698 return rc;
716} 699}
@@ -768,7 +751,7 @@ upper_size_to_lower_size(struct ecryptfs_crypt_stat *crypt_stat,
768{ 751{
769 loff_t lower_size; 752 loff_t lower_size;
770 753
771 lower_size = crypt_stat->num_header_bytes_at_front; 754 lower_size = ecryptfs_lower_header_size(crypt_stat);
772 if (upper_size != 0) { 755 if (upper_size != 0) {
773 loff_t num_extents; 756 loff_t num_extents;
774 757
@@ -1015,6 +998,28 @@ out:
1015 return rc; 998 return rc;
1016} 999}
1017 1000
1001int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry,
1002 struct kstat *stat)
1003{
1004 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
1005 int rc = 0;
1006
1007 mount_crypt_stat = &ecryptfs_superblock_to_private(
1008 dentry->d_sb)->mount_crypt_stat;
1009 generic_fillattr(dentry->d_inode, stat);
1010 if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
1011 char *target;
1012 size_t targetsiz;
1013
1014 rc = ecryptfs_readlink_lower(dentry, &target, &targetsiz);
1015 if (!rc) {
1016 kfree(target);
1017 stat->size = targetsiz;
1018 }
1019 }
1020 return rc;
1021}
1022
1018int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, 1023int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1019 struct kstat *stat) 1024 struct kstat *stat)
1020{ 1025{
@@ -1039,7 +1044,7 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
1039 1044
1040 lower_dentry = ecryptfs_dentry_to_lower(dentry); 1045 lower_dentry = ecryptfs_dentry_to_lower(dentry);
1041 if (!lower_dentry->d_inode->i_op->setxattr) { 1046 if (!lower_dentry->d_inode->i_op->setxattr) {
1042 rc = -ENOSYS; 1047 rc = -EOPNOTSUPP;
1043 goto out; 1048 goto out;
1044 } 1049 }
1045 mutex_lock(&lower_dentry->d_inode->i_mutex); 1050 mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1057,7 +1062,7 @@ ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name,
1057 int rc = 0; 1062 int rc = 0;
1058 1063
1059 if (!lower_dentry->d_inode->i_op->getxattr) { 1064 if (!lower_dentry->d_inode->i_op->getxattr) {
1060 rc = -ENOSYS; 1065 rc = -EOPNOTSUPP;
1061 goto out; 1066 goto out;
1062 } 1067 }
1063 mutex_lock(&lower_dentry->d_inode->i_mutex); 1068 mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1084,7 +1089,7 @@ ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size)
1084 1089
1085 lower_dentry = ecryptfs_dentry_to_lower(dentry); 1090 lower_dentry = ecryptfs_dentry_to_lower(dentry);
1086 if (!lower_dentry->d_inode->i_op->listxattr) { 1091 if (!lower_dentry->d_inode->i_op->listxattr) {
1087 rc = -ENOSYS; 1092 rc = -EOPNOTSUPP;
1088 goto out; 1093 goto out;
1089 } 1094 }
1090 mutex_lock(&lower_dentry->d_inode->i_mutex); 1095 mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1101,7 +1106,7 @@ static int ecryptfs_removexattr(struct dentry *dentry, const char *name)
1101 1106
1102 lower_dentry = ecryptfs_dentry_to_lower(dentry); 1107 lower_dentry = ecryptfs_dentry_to_lower(dentry);
1103 if (!lower_dentry->d_inode->i_op->removexattr) { 1108 if (!lower_dentry->d_inode->i_op->removexattr) {
1104 rc = -ENOSYS; 1109 rc = -EOPNOTSUPP;
1105 goto out; 1110 goto out;
1106 } 1111 }
1107 mutex_lock(&lower_dentry->d_inode->i_mutex); 1112 mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1132,6 +1137,7 @@ const struct inode_operations ecryptfs_symlink_iops = {
1132 .put_link = ecryptfs_put_link, 1137 .put_link = ecryptfs_put_link,
1133 .permission = ecryptfs_permission, 1138 .permission = ecryptfs_permission,
1134 .setattr = ecryptfs_setattr, 1139 .setattr = ecryptfs_setattr,
1140 .getattr = ecryptfs_getattr_link,
1135 .setxattr = ecryptfs_setxattr, 1141 .setxattr = ecryptfs_setxattr,
1136 .getxattr = ecryptfs_getxattr, 1142 .getxattr = ecryptfs_getxattr,
1137 .listxattr = ecryptfs_listxattr, 1143 .listxattr = ecryptfs_listxattr,
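Three distinct fixes land in the inode.c section above. First, readlink is split in two: ecryptfs_readlink_lower() reads and decrypts the lower target into a kernel buffer, and both ecryptfs_readlink() and the new ecryptfs_getattr_link() are built on it, so stat() on a symlink reports the decrypted target length when filename encryption is enabled (hence .getattr = ecryptfs_getattr_link wired into ecryptfs_symlink_iops). Reading through a __user-typed op into a kernel buffer uses the set_fs() idiom of this era; a self-contained sketch of the pattern, the wrapper name being hypothetical:

	static int read_lower_link(struct dentry *lower_dentry,
				   char *kbuf, int kbufsiz)
	{
		mm_segment_t old_fs = get_fs();
		int rc;

		/* Widen the address limit so an API that expects a
		 * __user pointer accepts a kernel buffer. */
		set_fs(get_ds());
		rc = lower_dentry->d_inode->i_op->readlink(lower_dentry,
					(char __user *)kbuf, kbufsiz);
		set_fs(old_fs);
		return rc;
	}

Second, the xattr stubs now fail with -EOPNOTSUPP instead of -ENOSYS: ENOSYS claims the system call itself is unimplemented, while EOPNOTSUPP correctly says that this particular object does not support the operation. Third, ecryptfs_link() copies times and size from the lower *directory* inode rather than from the lower dentry of the new link, matching the other directory-modifying operations, and the lookup failure message drops from KERN_ERR to KERN_DEBUG since a failed lookup is a perfectly normal event.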
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index a0a7847567e9..89c5476506ef 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -32,6 +32,7 @@
32#include <linux/random.h> 32#include <linux/random.h>
33#include <linux/crypto.h> 33#include <linux/crypto.h>
34#include <linux/scatterlist.h> 34#include <linux/scatterlist.h>
35#include <linux/slab.h>
35#include "ecryptfs_kernel.h" 36#include "ecryptfs_kernel.h"
36 37
37/** 38/**
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index e14cf7e588db..d8c3a373aafa 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/slab.h>
25#include <linux/wait.h> 26#include <linux/wait.h>
26#include <linux/mount.h> 27#include <linux/mount.h>
27#include "ecryptfs_kernel.h" 28#include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index ea2f92101dfe..760983d0f25e 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -35,6 +35,7 @@
35#include <linux/key.h> 35#include <linux/key.h>
36#include <linux/parser.h> 36#include <linux/parser.h>
37#include <linux/fs_stack.h> 37#include <linux/fs_stack.h>
38#include <linux/slab.h>
38#include "ecryptfs_kernel.h" 39#include "ecryptfs_kernel.h"
39 40
40/** 41/**
@@ -496,17 +497,25 @@ struct kmem_cache *ecryptfs_sb_info_cache;
496static int 497static int
497ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent) 498ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent)
498{ 499{
500 struct ecryptfs_sb_info *esi;
499 int rc = 0; 501 int rc = 0;
500 502
501 /* Released in ecryptfs_put_super() */ 503 /* Released in ecryptfs_put_super() */
502 ecryptfs_set_superblock_private(sb, 504 ecryptfs_set_superblock_private(sb,
503 kmem_cache_zalloc(ecryptfs_sb_info_cache, 505 kmem_cache_zalloc(ecryptfs_sb_info_cache,
504 GFP_KERNEL)); 506 GFP_KERNEL));
505 if (!ecryptfs_superblock_to_private(sb)) { 507 esi = ecryptfs_superblock_to_private(sb);
508 if (!esi) {
506 ecryptfs_printk(KERN_WARNING, "Out of memory\n"); 509 ecryptfs_printk(KERN_WARNING, "Out of memory\n");
507 rc = -ENOMEM; 510 rc = -ENOMEM;
508 goto out; 511 goto out;
509 } 512 }
513
514 rc = bdi_setup_and_register(&esi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
515 if (rc)
516 goto out;
517
518 sb->s_bdi = &esi->bdi;
510 sb->s_op = &ecryptfs_sops; 519 sb->s_op = &ecryptfs_sops;
511 /* Released through deactivate_super(sb) from get_sb_nodev */ 520 /* Released through deactivate_super(sb) from get_sb_nodev */
512 sb->s_root = d_alloc(NULL, &(const struct qstr) { 521 sb->s_root = d_alloc(NULL, &(const struct qstr) {
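eCryptfs now owns a backing_dev_info: per-bdi writeback wants every superblock to point at a real bdi rather than the shared default, and a stacked filesystem has no block device to borrow one from. The part worth internalizing is the lifecycle pairing this diff establishes; a condensed sketch (error handling trimmed):

	/* ecryptfs_fill_super() */
	rc = bdi_setup_and_register(&esi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
	if (rc)
		goto out;
	sb->s_bdi = &esi->bdi;

	/* ecryptfs_put_super() */
	bdi_destroy(&sb_info->bdi);

bdi_setup_and_register() appends a per-call sequence number to the name, so multiple eCryptfs mounts register distinct, individually visible bdi instances.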
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index f1c17e87c5fb..2d8dbce9d485 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -20,6 +20,7 @@
20 * 02111-1307, USA. 20 * 02111-1307, USA.
21 */ 21 */
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/slab.h>
23#include <linux/user_namespace.h> 24#include <linux/user_namespace.h>
24#include <linux/nsproxy.h> 25#include <linux/nsproxy.h>
25#include "ecryptfs_kernel.h" 26#include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 4ec8f61ccf5a..3745f612bcd4 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -24,6 +24,7 @@
24#include <linux/random.h> 24#include <linux/random.h>
25#include <linux/miscdevice.h> 25#include <linux/miscdevice.h>
26#include <linux/poll.h> 26#include <linux/poll.h>
27#include <linux/slab.h>
27#include <linux/wait.h> 28#include <linux/wait.h>
28#include <linux/module.h> 29#include <linux/module.h>
29#include "ecryptfs_kernel.h" 30#include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index df4ce99d0597..2ee9a3a7b68c 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -32,6 +32,7 @@
32#include <linux/file.h> 32#include <linux/file.h>
33#include <linux/crypto.h> 33#include <linux/crypto.h>
34#include <linux/scatterlist.h> 34#include <linux/scatterlist.h>
35#include <linux/slab.h>
35#include <asm/unaligned.h> 36#include <asm/unaligned.h>
36#include "ecryptfs_kernel.h" 37#include "ecryptfs_kernel.h"
37 38
@@ -82,6 +83,19 @@ out:
82 return rc; 83 return rc;
83} 84}
84 85
86static void strip_xattr_flag(char *page_virt,
87 struct ecryptfs_crypt_stat *crypt_stat)
88{
89 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) {
90 size_t written;
91
92 crypt_stat->flags &= ~ECRYPTFS_METADATA_IN_XATTR;
93 ecryptfs_write_crypt_stat_flags(page_virt, crypt_stat,
94 &written);
95 crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
96 }
97}
98
85/** 99/**
86 * Header Extent: 100 * Header Extent:
87 * Octets 0-7: Unencrypted file size (big-endian) 101 * Octets 0-7: Unencrypted file size (big-endian)
@@ -97,19 +111,6 @@ out:
97 * (big-endian) 111 * (big-endian)
98 * Octet 26: Begin RFC 2440 authentication token packet set 112 * Octet 26: Begin RFC 2440 authentication token packet set
99 */ 113 */
100static void set_header_info(char *page_virt,
101 struct ecryptfs_crypt_stat *crypt_stat)
102{
103 size_t written;
104 size_t save_num_header_bytes_at_front =
105 crypt_stat->num_header_bytes_at_front;
106
107 crypt_stat->num_header_bytes_at_front =
108 ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
109 ecryptfs_write_header_metadata(page_virt + 20, crypt_stat, &written);
110 crypt_stat->num_header_bytes_at_front =
111 save_num_header_bytes_at_front;
112}
113 114
114/** 115/**
115 * ecryptfs_copy_up_encrypted_with_header 116 * ecryptfs_copy_up_encrypted_with_header
@@ -135,8 +136,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
135 * num_extents_per_page) 136 * num_extents_per_page)
136 + extent_num_in_page); 137 + extent_num_in_page);
137 size_t num_header_extents_at_front = 138 size_t num_header_extents_at_front =
138 (crypt_stat->num_header_bytes_at_front 139 (crypt_stat->metadata_size / crypt_stat->extent_size);
139 / crypt_stat->extent_size);
140 140
141 if (view_extent_num < num_header_extents_at_front) { 141 if (view_extent_num < num_header_extents_at_front) {
142 /* This is a header extent */ 142 /* This is a header extent */
@@ -146,9 +146,14 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
146 memset(page_virt, 0, PAGE_CACHE_SIZE); 146 memset(page_virt, 0, PAGE_CACHE_SIZE);
147 /* TODO: Support more than one header extent */ 147 /* TODO: Support more than one header extent */
148 if (view_extent_num == 0) { 148 if (view_extent_num == 0) {
149 size_t written;
150
149 rc = ecryptfs_read_xattr_region( 151 rc = ecryptfs_read_xattr_region(
150 page_virt, page->mapping->host); 152 page_virt, page->mapping->host);
151 set_header_info(page_virt, crypt_stat); 153 strip_xattr_flag(page_virt + 16, crypt_stat);
154 ecryptfs_write_header_metadata(page_virt + 20,
155 crypt_stat,
156 &written);
152 } 157 }
153 kunmap_atomic(page_virt, KM_USER0); 158 kunmap_atomic(page_virt, KM_USER0);
154 flush_dcache_page(page); 159 flush_dcache_page(page);
@@ -161,7 +166,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
161 /* This is an encrypted data extent */ 166 /* This is an encrypted data extent */
162 loff_t lower_offset = 167 loff_t lower_offset =
163 ((view_extent_num * crypt_stat->extent_size) 168 ((view_extent_num * crypt_stat->extent_size)
164 - crypt_stat->num_header_bytes_at_front); 169 - crypt_stat->metadata_size);
165 170
166 rc = ecryptfs_read_lower_page_segment( 171 rc = ecryptfs_read_lower_page_segment(
167 page, (lower_offset >> PAGE_CACHE_SHIFT), 172 page, (lower_offset >> PAGE_CACHE_SHIFT),
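In mmap.c, set_header_info() is replaced by strip_xattr_flag() plus a direct call to the now-exported ecryptfs_write_header_metadata(). The purpose: when synthesizing the "encrypted view" of a file whose metadata lives in an xattr, the generated header must look like an ordinary in-file header, so the METADATA_IN_XATTR bit has to be cleared in the serialized flag word while the in-memory flag survives. The save/clear/serialize/restore idiom, as introduced above:

	if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) {
		size_t written;

		crypt_stat->flags &= ~ECRYPTFS_METADATA_IN_XATTR;
		ecryptfs_write_crypt_stat_flags(page_virt, crypt_stat,
						&written);
		crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
	}

The old helper's save/restore of num_header_bytes_at_front becomes unnecessary once metadata_size holds the real metadata size even for xattr-backed files, which is why set_header_info() can simply disappear.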
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index b15a43a80ab7..0c0ae491d231 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -26,6 +26,7 @@
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/mount.h> 27#include <linux/mount.h>
28#include <linux/key.h> 28#include <linux/key.h>
29#include <linux/slab.h>
29#include <linux/seq_file.h> 30#include <linux/seq_file.h>
30#include <linux/smp_lock.h> 31#include <linux/smp_lock.h>
31#include <linux/file.h> 32#include <linux/file.h>
@@ -85,7 +86,6 @@ static void ecryptfs_destroy_inode(struct inode *inode)
85 if (lower_dentry->d_inode) { 86 if (lower_dentry->d_inode) {
86 fput(inode_info->lower_file); 87 fput(inode_info->lower_file);
87 inode_info->lower_file = NULL; 88 inode_info->lower_file = NULL;
88 d_drop(lower_dentry);
89 } 89 }
90 } 90 }
91 ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat); 91 ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat);
@@ -122,6 +122,7 @@ static void ecryptfs_put_super(struct super_block *sb)
122 lock_kernel(); 122 lock_kernel();
123 123
124 ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); 124 ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
125 bdi_destroy(&sb_info->bdi);
125 kmem_cache_free(ecryptfs_sb_info_cache, sb_info); 126 kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
126 ecryptfs_set_superblock_private(sb, NULL); 127 ecryptfs_set_superblock_private(sb, NULL);
127 128
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 7758cc382ef0..6bd3f76fdf88 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -11,6 +11,7 @@
11#include <linux/fs.h> 11#include <linux/fs.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/slab.h>
14#include <linux/list.h> 15#include <linux/list.h>
15#include <linux/spinlock.h> 16#include <linux/spinlock.h>
16#include <linux/anon_inodes.h> 17#include <linux/anon_inodes.h>
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index bd056a5b4efc..3817149919cb 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1140,8 +1140,7 @@ retry:
1140 * ep_poll_callback() when events will become available. 1140 * ep_poll_callback() when events will become available.
1141 */ 1141 */
1142 init_waitqueue_entry(&wait, current); 1142 init_waitqueue_entry(&wait, current);
1143 wait.flags |= WQ_FLAG_EXCLUSIVE; 1143 __add_wait_queue_exclusive(&ep->wq, &wait);
1144 __add_wait_queue(&ep->wq, &wait);
1145 1144
1146 for (;;) { 1145 for (;;) {
1147 /* 1146 /*
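The eventpoll change is a straight substitution for a recently added helper. __add_wait_queue_exclusive() is, modulo naming, exactly the two lines it replaces:

	static inline void
	__add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
	{
		wait->flags |= WQ_FLAG_EXCLUSIVE;
		__add_wait_queue(q, wait);
	}

The exclusive flag is what keeps one ready event from waking every thread blocked in epoll_wait() on the same epoll descriptor: a wake-up stops after the first exclusive waiter it satisfies.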
diff --git a/fs/exec.c b/fs/exec.c
index 49cdaa19e5b9..e6e94c626c2c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1387,8 +1387,6 @@ int do_execve(char * filename,
1387 if (retval < 0) 1387 if (retval < 0)
1388 goto out; 1388 goto out;
1389 1389
1390 current->stack_start = current->mm->start_stack;
1391
1392 /* execve succeeded */ 1390 /* execve succeeded */
1393 current->fs->in_exec = 0; 1391 current->fs->in_exec = 0;
1394 current->in_execve = 0; 1392 current->in_execve = 0;
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 8442e353309f..22721b2fd890 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -35,6 +35,7 @@
35 35
36#include <linux/fs.h> 36#include <linux/fs.h>
37#include <linux/time.h> 37#include <linux/time.h>
38#include <linux/backing-dev.h>
38#include "common.h" 39#include "common.h"
39 40
40/* FIXME: Remove once pnfs hits mainline 41/* FIXME: Remove once pnfs hits mainline
@@ -84,6 +85,7 @@ struct exofs_sb_info {
84 u32 s_next_generation; /* next gen # to use */ 85 u32 s_next_generation; /* next gen # to use */
85 atomic_t s_curr_pending; /* number of pending commands */ 86 atomic_t s_curr_pending; /* number of pending commands */
86 uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */ 87 uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */
88 struct backing_dev_info bdi; /* register our bdi with VFS */
87 89
88 struct pnfs_osd_data_map data_map; /* Default raid to use 90 struct pnfs_osd_data_map data_map; /* Default raid to use
89 * FIXME: Needed ? 91 * FIXME: Needed ?
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index a17e4b733e35..76d2a79ef93e 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -31,6 +31,7 @@
31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
32 */ 32 */
33 33
34#include <linux/slab.h>
34#include <linux/writeback.h> 35#include <linux/writeback.h>
35#include <linux/buffer_head.h> 36#include <linux/buffer_head.h>
36#include <scsi/scsi_device.h> 37#include <scsi/scsi_device.h>
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 5293bc411d17..4337cad7777b 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -22,6 +22,7 @@
22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 */ 23 */
24 24
25#include <linux/slab.h>
25#include <scsi/scsi_device.h> 26#include <scsi/scsi_device.h>
26#include <asm/div64.h> 27#include <asm/div64.h>
27 28
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 6cf5e4e84d61..03149b9a5178 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -37,6 +37,7 @@
37#include <linux/vfs.h> 37#include <linux/vfs.h>
38#include <linux/random.h> 38#include <linux/random.h>
39#include <linux/exportfs.h> 39#include <linux/exportfs.h>
40#include <linux/slab.h>
40 41
41#include "exofs.h" 42#include "exofs.h"
42 43
@@ -301,6 +302,7 @@ static void exofs_put_super(struct super_block *sb)
301 _exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0], 302 _exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0],
302 sbi->layout.s_pid); 303 sbi->layout.s_pid);
303 304
305 bdi_destroy(&sbi->bdi);
304 exofs_free_sbi(sbi); 306 exofs_free_sbi(sbi);
305 sb->s_fs_info = NULL; 307 sb->s_fs_info = NULL;
306} 308}
@@ -545,6 +547,10 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
545 if (!sbi) 547 if (!sbi)
546 return -ENOMEM; 548 return -ENOMEM;
547 549
550 ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY);
551 if (ret)
552 goto free_bdi;
553
548 /* use mount options to fill superblock */ 554 /* use mount options to fill superblock */
549 od = osduld_path_lookup(opts->dev_name); 555 od = osduld_path_lookup(opts->dev_name);
550 if (IS_ERR(od)) { 556 if (IS_ERR(od)) {
@@ -611,6 +617,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
611 } 617 }
612 618
613 /* set up operation vectors */ 619 /* set up operation vectors */
620 sb->s_bdi = &sbi->bdi;
614 sb->s_fs_info = sbi; 621 sb->s_fs_info = sbi;
615 sb->s_op = &exofs_sops; 622 sb->s_op = &exofs_sops;
616 sb->s_export_op = &exofs_export_ops; 623 sb->s_export_op = &exofs_export_ops;
@@ -642,6 +649,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
642 return 0; 649 return 0;
643 650
644free_sbi: 651free_sbi:
652 bdi_destroy(&sbi->bdi);
653free_bdi:
645 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", 654 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
646 opts->dev_name, sbi->layout.s_pid, ret); 655 opts->dev_name, sbi->layout.s_pid, ret);
647 exofs_free_sbi(sbi); 656 exofs_free_sbi(sbi);
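exofs gets the same per-superblock bdi treatment as eCryptfs above; the detail specific to this hunk is the error ladder. bdi_setup_and_register() now runs before anything else in exofs_fill_super(), so the pre-existing free_sbi label acquires a bdi_destroy() and a new free_bdi label covers failures that happen before registration succeeded. The shape, as a sketch:

	ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY);
	if (ret)
		goto free_bdi;		/* registration failed: nothing to undo */
	/* ... */
free_sbi:
	bdi_destroy(&sbi->bdi);		/* undo the successful registration */
free_bdi:
	EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
		  opts->dev_name, sbi->layout.s_pid, ret);
	exofs_free_sbi(sbi);

The matching teardown for a healthy mount sits in exofs_put_super().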
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 1d081f0cfec2..3cf038c055d7 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -13,6 +13,7 @@
13 13
14#include "ext2.h" 14#include "ext2.h"
15#include <linux/quotaops.h> 15#include <linux/quotaops.h>
16#include <linux/slab.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
17#include <linux/buffer_head.h> 18#include <linux/buffer_head.h>
18#include <linux/capability.h> 19#include <linux/capability.h>
diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c
index 4e2426e22bbe..565cf817bbf1 100644
--- a/fs/ext2/symlink.c
+++ b/fs/ext2/symlink.c
@@ -32,6 +32,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
32 .readlink = generic_readlink, 32 .readlink = generic_readlink,
33 .follow_link = page_follow_link_light, 33 .follow_link = page_follow_link_light,
34 .put_link = page_put_link, 34 .put_link = page_put_link,
35 .setattr = ext2_setattr,
35#ifdef CONFIG_EXT2_FS_XATTR 36#ifdef CONFIG_EXT2_FS_XATTR
36 .setxattr = generic_setxattr, 37 .setxattr = generic_setxattr,
37 .getxattr = generic_getxattr, 38 .getxattr = generic_getxattr,
@@ -43,6 +44,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
43const struct inode_operations ext2_fast_symlink_inode_operations = { 44const struct inode_operations ext2_fast_symlink_inode_operations = {
44 .readlink = generic_readlink, 45 .readlink = generic_readlink,
45 .follow_link = ext2_follow_link, 46 .follow_link = ext2_follow_link,
47 .setattr = ext2_setattr,
46#ifdef CONFIG_EXT2_FS_XATTR 48#ifdef CONFIG_EXT2_FS_XATTR
47 .setxattr = generic_setxattr, 49 .setxattr = generic_setxattr,
48 .getxattr = generic_getxattr, 50 .getxattr = generic_getxattr,
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index c8155845ac05..b118c6383c6d 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -4,6 +4,7 @@
4 */ 4 */
5 5
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/slab.h>
7#include <linux/string.h> 8#include <linux/string.h>
8#include <linux/fs.h> 9#include <linux/fs.h>
9#include <linux/ext2_fs.h> 10#include <linux/ext2_fs.h>
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 161da2d3f890..a177122a1b25 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -14,6 +14,7 @@
14#include <linux/time.h> 14#include <linux/time.h>
15#include <linux/capability.h> 15#include <linux/capability.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/slab.h>
17#include <linux/jbd.h> 18#include <linux/jbd.h>
18#include <linux/ext3_fs.h> 19#include <linux/ext3_fs.h>
19#include <linux/ext3_jbd.h> 20#include <linux/ext3_jbd.h>
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index ef9008b885b5..0d0e97ed3ff6 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -582,7 +582,9 @@ got:
582 inode->i_generation = sbi->s_next_generation++; 582 inode->i_generation = sbi->s_next_generation++;
583 spin_unlock(&sbi->s_next_gen_lock); 583 spin_unlock(&sbi->s_next_gen_lock);
584 584
585 ei->i_state = EXT3_STATE_NEW; 585 ei->i_state_flags = 0;
586 ext3_set_inode_state(inode, EXT3_STATE_NEW);
587
586 ei->i_extra_isize = 588 ei->i_extra_isize =
587 (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ? 589 (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ?
588 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; 590 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0;
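ext3 is mid-conversion here from a directly assigned ei->i_state to an i_state_flags word driven through bit accessors, mirroring what ext4 already did; that is why both the allocation path above and ext3_iget() below start by zeroing i_state_flags, with EXT3_STATE_NEW then set via a helper. Assuming the accessors follow the ext4 pattern, they are thin wrappers over the atomic bitops:

	static inline void ext3_set_inode_state(struct inode *inode, int bit)
	{
		set_bit(bit, &EXT3_I(inode)->i_state_flags);
	}

	static inline int ext3_test_inode_state(struct inode *inode, int bit)
	{
		return test_bit(bit, &EXT3_I(inode)->i_state_flags);
	}

The atomicity is the point: a plain |= on a shared state word can race with another CPU updating a different bit of the same inode.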
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 7f920b7263a4..ea33bdf0a300 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2811,7 +2811,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
2811 inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime); 2811 inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
2812 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; 2812 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2813 2813
2814 ei->i_state = 0; 2814 ei->i_state_flags = 0;
2815 ei->i_dir_start_lookup = 0; 2815 ei->i_dir_start_lookup = 0;
2816 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 2816 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2817 /* We now have enough fields to check if the inode was active or not. 2817 /* We now have enough fields to check if the inode was active or not.
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index e844accbf55d..1bee604cc6cd 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -164,7 +164,7 @@ void ext3_msg(struct super_block *sb, const char *prefix,
164 * write out the superblock safely. 164 * write out the superblock safely.
165 * 165 *
166 * We'll just use the journal_abort() error code to record an error in 166 * We'll just use the journal_abort() error code to record an error in
167 * the journal instead. On recovery, the journal will compain about 167 * the journal instead. On recovery, the journal will complain about
168 * that error until we've noted it down and cleared it. 168 * that error until we've noted it down and cleared it.
169 */ 169 */
170 170
diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c
index ff7b4ccd8983..7c4898207776 100644
--- a/fs/ext3/symlink.c
+++ b/fs/ext3/symlink.c
@@ -34,6 +34,7 @@ const struct inode_operations ext3_symlink_inode_operations = {
34 .readlink = generic_readlink, 34 .readlink = generic_readlink,
35 .follow_link = page_follow_link_light, 35 .follow_link = page_follow_link_light,
36 .put_link = page_put_link, 36 .put_link = page_put_link,
37 .setattr = ext3_setattr,
37#ifdef CONFIG_EXT3_FS_XATTR 38#ifdef CONFIG_EXT3_FS_XATTR
38 .setxattr = generic_setxattr, 39 .setxattr = generic_setxattr,
39 .getxattr = generic_getxattr, 40 .getxattr = generic_getxattr,
@@ -45,6 +46,7 @@ const struct inode_operations ext3_symlink_inode_operations = {
45const struct inode_operations ext3_fast_symlink_inode_operations = { 46const struct inode_operations ext3_fast_symlink_inode_operations = {
46 .readlink = generic_readlink, 47 .readlink = generic_readlink,
47 .follow_link = ext3_follow_link, 48 .follow_link = ext3_follow_link,
49 .setattr = ext3_setattr,
48#ifdef CONFIG_EXT3_FS_XATTR 50#ifdef CONFIG_EXT3_FS_XATTR
49 .setxattr = generic_setxattr, 51 .setxattr = generic_setxattr,
50 .getxattr = generic_getxattr, 52 .getxattr = generic_getxattr,
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 474348788dd9..3af91f476dff 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -4,6 +4,7 @@
4 */ 4 */
5 5
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/slab.h>
7#include <linux/string.h> 8#include <linux/string.h>
8#include <linux/fs.h> 9#include <linux/fs.h>
9#include <linux/ext3_jbd.h> 10#include <linux/ext3_jbd.h>
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 983f0e127493..538c48655084 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -18,6 +18,7 @@
18#include <linux/pagemap.h> 18#include <linux/pagemap.h>
19#include <linux/blkdev.h> 19#include <linux/blkdev.h>
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/slab.h>
21#include "ext4.h" 22#include "ext4.h"
22 23
23struct ext4_system_zone { 24struct ext4_system_zone {
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 94c8ee81f5e1..236b834b4ca8 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3879,6 +3879,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
3879 physical += offset; 3879 physical += offset;
3880 length = EXT4_SB(inode->i_sb)->s_inode_size - offset; 3880 length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
3881 flags |= FIEMAP_EXTENT_DATA_INLINE; 3881 flags |= FIEMAP_EXTENT_DATA_INLINE;
3882 brelse(iloc.bh);
3882 } else { /* external block */ 3883 } else { /* external block */
3883 physical = EXT4_I(inode)->i_file_acl << blockbits; 3884 physical = EXT4_I(inode)->i_file_acl << blockbits;
3884 length = inode->i_sb->s_blocksize; 3885 length = inode->i_sb->s_blocksize;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 361c0b9962a8..57f6eef6ccd6 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -263,7 +263,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
263 ext4_group_t f; 263 ext4_group_t f;
264 264
265 f = ext4_flex_group(sbi, block_group); 265 f = ext4_flex_group(sbi, block_group);
266 atomic_dec(&sbi->s_flex_groups[f].free_inodes); 266 atomic_dec(&sbi->s_flex_groups[f].used_dirs);
267 } 267 }
268 268
269 } 269 }
@@ -773,7 +773,7 @@ static int ext4_claim_inode(struct super_block *sb,
773 if (sbi->s_log_groups_per_flex) { 773 if (sbi->s_log_groups_per_flex) {
774 ext4_group_t f = ext4_flex_group(sbi, group); 774 ext4_group_t f = ext4_flex_group(sbi, group);
775 775
776 atomic_inc(&sbi->s_flex_groups[f].free_inodes); 776 atomic_inc(&sbi->s_flex_groups[f].used_dirs);
777 } 777 }
778 } 778 }
779 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); 779 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
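The two ialloc.c hunks fix mirrored accounting bugs: when a directory inode was freed or claimed on a flex-group filesystem, the code adjusted free_inodes where it meant used_dirs, so the per-flex-group directory count drifted while free inodes were double-counted. The corrected pair is symmetric:

	/* ext4_free_inode(), directory case */
	atomic_dec(&sbi->s_flex_groups[f].used_dirs);

	/* ext4_claim_inode(), directory case */
	atomic_inc(&sbi->s_flex_groups[f].used_dirs);

The surrounding conditions (not visible in these hunks) apply the update only for directory inodes, which is what makes used_dirs the right counter on both paths.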
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 986120f30066..81d605412844 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -39,6 +39,7 @@
39#include <linux/bio.h> 39#include <linux/bio.h>
40#include <linux/workqueue.h> 40#include <linux/workqueue.h>
41#include <linux/kernel.h> 41#include <linux/kernel.h>
42#include <linux/slab.h>
42 43
43#include "ext4_jbd2.h" 44#include "ext4_jbd2.h"
44#include "xattr.h" 45#include "xattr.h"
@@ -1035,7 +1036,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1035 sector_t lblock) 1036 sector_t lblock)
1036{ 1037{
1037 struct ext4_inode_info *ei = EXT4_I(inode); 1038 struct ext4_inode_info *ei = EXT4_I(inode);
1038 int dind_mask = EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1; 1039 sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
1039 int blk_bits; 1040 int blk_bits;
1040 1041
1041 if (lblock < EXT4_NDIR_BLOCKS) 1042 if (lblock < EXT4_NDIR_BLOCKS)
@@ -1050,7 +1051,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1050 } 1051 }
1051 ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; 1052 ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
1052 ei->i_da_metadata_calc_len = 1; 1053 ei->i_da_metadata_calc_len = 1;
1053 blk_bits = roundup_pow_of_two(lblock + 1); 1054 blk_bits = order_base_2(lblock);
1054 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; 1055 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
1055} 1056}
1056 1057
@@ -5374,7 +5375,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5374 } else { 5375 } else {
5375 struct ext4_iloc iloc; 5376 struct ext4_iloc iloc;
5376 5377
5377 err = ext4_get_inode_loc(inode, &iloc); 5378 err = __ext4_get_inode_loc(inode, &iloc, 0);
5378 if (err) 5379 if (err)
5379 return err; 5380 return err;
5380 if (wbc->sync_mode == WB_SYNC_ALL) 5381 if (wbc->sync_mode == WB_SYNC_ALL)
@@ -5385,6 +5386,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5385 (unsigned long long)iloc.bh->b_blocknr); 5386 (unsigned long long)iloc.bh->b_blocknr);
5386 err = -EIO; 5387 err = -EIO;
5387 } 5388 }
5389 brelse(iloc.bh);
5388 } 5390 }
5389 return err; 5391 return err;
5390} 5392}
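Two buffer_head leaks are plugged on the ext4 side: ext4_xattr_fiemap() (extents.c above) and the no-journal branch of ext4_write_inode() both obtain an inode location whose iloc.bh carries a reference, and neither released it. The contract, sketched:

	struct ext4_iloc iloc;
	int err;

	err = ext4_get_inode_loc(inode, &iloc);
	if (err)
		return err;
	/* ... read or dirty the raw inode through iloc.bh ... */
	brelse(iloc.bh);	/* mandatory: the lookup took a reference */

ext4_write_inode() also switches to __ext4_get_inode_loc(inode, &iloc, 0); the zero appears to disable the in-memory shortcut, ensuring the inode table block is genuinely read before being synced rather than fabricated from cached state. The dind_mask and order_base_2() hunks in the same file correct the delayed-allocation metadata estimate: the mask must be sector_t-wide and inverted so lblock is rounded *down* to a double-indirect boundary, and order_base_2(lblock) yields the base-2 order that the old roundup_pow_of_two() call (which returns a value, not an exponent) was misused for.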
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 506713a2ebd8..b423a364dca3 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -23,6 +23,7 @@
23 23
24#include "mballoc.h" 24#include "mballoc.h"
25#include <linux/debugfs.h> 25#include <linux/debugfs.h>
26#include <linux/slab.h>
26#include <trace/events/ext4.h> 27#include <trace/events/ext4.h>
27 28
28/* 29/*
@@ -69,7 +70,7 @@
69 * 70 *
70 * pa_lstart -> the logical start block for this prealloc space 71 * pa_lstart -> the logical start block for this prealloc space
71 * pa_pstart -> the physical start block for this prealloc space 72 * pa_pstart -> the physical start block for this prealloc space
72 * pa_len -> lenght for this prealloc space 73 * pa_len -> length for this prealloc space
73 * pa_free -> free space available in this prealloc space 74 * pa_free -> free space available in this prealloc space
74 * 75 *
75 * The inode preallocation space is used looking at the _logical_ start 76 * The inode preallocation space is used looking at the _logical_ start
@@ -2534,6 +2535,17 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2534 mb_debug(1, "gonna free %u blocks in group %u (0x%p):", 2535 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2535 entry->count, entry->group, entry); 2536 entry->count, entry->group, entry);
2536 2537
2538 if (test_opt(sb, DISCARD)) {
2539 ext4_fsblk_t discard_block;
2540
2541 discard_block = entry->start_blk +
2542 ext4_group_first_block_no(sb, entry->group);
2543 trace_ext4_discard_blocks(sb,
2544 (unsigned long long)discard_block,
2545 entry->count);
2546 sb_issue_discard(sb, discard_block, entry->count);
2547 }
2548
2537 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2549 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2538 /* we expect to find existing buddy because it's pinned */ 2550 /* we expect to find existing buddy because it's pinned */
2539 BUG_ON(err != 0); 2551 BUG_ON(err != 0);
@@ -2555,16 +2567,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2555 page_cache_release(e4b.bd_bitmap_page); 2567 page_cache_release(e4b.bd_bitmap_page);
2556 } 2568 }
2557 ext4_unlock_group(sb, entry->group); 2569 ext4_unlock_group(sb, entry->group);
2558 if (test_opt(sb, DISCARD)) {
2559 ext4_fsblk_t discard_block;
2560
2561 discard_block = entry->start_blk +
2562 ext4_group_first_block_no(sb, entry->group);
2563 trace_ext4_discard_blocks(sb,
2564 (unsigned long long)discard_block,
2565 entry->count);
2566 sb_issue_discard(sb, discard_block, entry->count);
2567 }
2568 kmem_cache_free(ext4_free_ext_cachep, entry); 2570 kmem_cache_free(ext4_free_ext_cachep, entry);
2569 ext4_mb_release_desc(&e4b); 2571 ext4_mb_release_desc(&e4b);
2570 } 2572 }
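The mballoc.c change is purely a move: the sb_issue_discard() block migrates from after the buddy-bitmap release to before ext4_mb_load_buddy(). Discarding while the blocks are still marked in-use closes a race in which another allocator grabs and writes the just-freed range before the (possibly asynchronous) discard reaches the device, erasing fresh data. Schematically:

	/* after this patch */
	if (test_opt(sb, DISCARD))
		sb_issue_discard(sb, discard_block, entry->count);

	err = ext4_mb_load_buddy(sb, entry->group, &e4b);
	/* ... mb_free_blocks() etc.: only past this point do the
	 * blocks become allocatable again ... */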
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 8b87bd0eac95..34dcfc52ef44 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -13,6 +13,7 @@
13 */ 13 */
14 14
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/slab.h>
16#include "ext4_jbd2.h" 17#include "ext4_jbd2.h"
17#include "ext4_extents.h" 18#include "ext4_extents.h"
18 19
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index aa5fe28d180f..d1fc662cc311 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -15,6 +15,7 @@
15 15
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/quotaops.h> 17#include <linux/quotaops.h>
18#include <linux/slab.h>
18#include "ext4_jbd2.h" 19#include "ext4_jbd2.h"
19#include "ext4_extents.h" 20#include "ext4_extents.h"
20#include "ext4.h" 21#include "ext4.h"
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2b83b96cb2eb..e14d22c170d5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -68,7 +68,21 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
68static int ext4_unfreeze(struct super_block *sb); 68static int ext4_unfreeze(struct super_block *sb);
69static void ext4_write_super(struct super_block *sb); 69static void ext4_write_super(struct super_block *sb);
70static int ext4_freeze(struct super_block *sb); 70static int ext4_freeze(struct super_block *sb);
71static int ext4_get_sb(struct file_system_type *fs_type, int flags,
72 const char *dev_name, void *data, struct vfsmount *mnt);
71 73
74#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
75static struct file_system_type ext3_fs_type = {
76 .owner = THIS_MODULE,
77 .name = "ext3",
78 .get_sb = ext4_get_sb,
79 .kill_sb = kill_block_super,
80 .fs_flags = FS_REQUIRES_DEV,
81};
82#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
83#else
84#define IS_EXT3_SB(sb) (0)
85#endif
72 86
73ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, 87ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
74 struct ext4_group_desc *bg) 88 struct ext4_group_desc *bg)
@@ -302,7 +316,7 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
302 * write out the superblock safely. 316 * write out the superblock safely.
303 * 317 *
304 * We'll just use the jbd2_journal_abort() error code to record an error in 318 * We'll just use the jbd2_journal_abort() error code to record an error in
305 * the journal instead. On recovery, the journal will compain about 319 * the journal instead. On recovery, the journal will complain about
306 * that error until we've noted it down and cleared it. 320 * that error until we've noted it down and cleared it.
307 */ 321 */
308 322
@@ -2358,7 +2372,7 @@ static void ext4_sb_release(struct kobject *kobj)
2358} 2372}
2359 2373
2360 2374
2361static struct sysfs_ops ext4_attr_ops = { 2375static const struct sysfs_ops ext4_attr_ops = {
2362 .show = ext4_attr_show, 2376 .show = ext4_attr_show,
2363 .store = ext4_attr_store, 2377 .store = ext4_attr_store,
2364}; 2378};
@@ -2539,7 +2553,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2539 * enable delayed allocation by default 2553 * enable delayed allocation by default
2540 * Use -o nodelalloc to turn it off 2554 * Use -o nodelalloc to turn it off
2541 */ 2555 */
2542 set_opt(sbi->s_mount_opt, DELALLOC); 2556 if (!IS_EXT3_SB(sb))
2557 set_opt(sbi->s_mount_opt, DELALLOC);
2543 2558
2544 if (!parse_options((char *) data, sb, &journal_devnum, 2559 if (!parse_options((char *) data, sb, &journal_devnum,
2545 &journal_ioprio, NULL, 0)) 2560 &journal_ioprio, NULL, 0))
@@ -4068,7 +4083,7 @@ static int ext4_get_sb(struct file_system_type *fs_type, int flags,
4068 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); 4083 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
4069} 4084}
4070 4085
4071#if !defined(CONTIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 4086#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
4072static struct file_system_type ext2_fs_type = { 4087static struct file_system_type ext2_fs_type = {
4073 .owner = THIS_MODULE, 4088 .owner = THIS_MODULE,
4074 .name = "ext2", 4089 .name = "ext2",
@@ -4095,15 +4110,7 @@ static inline void register_as_ext2(void) { }
4095static inline void unregister_as_ext2(void) { } 4110static inline void unregister_as_ext2(void) { }
4096#endif 4111#endif
4097 4112
4098#if !defined(CONTIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 4113#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
4099static struct file_system_type ext3_fs_type = {
4100 .owner = THIS_MODULE,
4101 .name = "ext3",
4102 .get_sb = ext4_get_sb,
4103 .kill_sb = kill_block_super,
4104 .fs_flags = FS_REQUIRES_DEV,
4105};
4106
4107static inline void register_as_ext3(void) 4114static inline void register_as_ext3(void)
4108{ 4115{
4109 int err = register_filesystem(&ext3_fs_type); 4116 int err = register_filesystem(&ext3_fs_type);
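The super.c churn serves one behavioral change. ext3_fs_type is hoisted above ext4_fill_super() so the new IS_EXT3_SB() macro can compare the block device's holder against it; get_sb_bdev() records the file_system_type pointer as bd_holder when claiming the device, which is what makes the comparison a reliable "was this mounted via the ext3 name?" test:

	#define IS_EXT3_SB(sb)	((sb)->s_bdev->bd_holder == &ext3_fs_type)

	/* in ext4_fill_super(): delalloc stays off for borrowed mounts */
	if (!IS_EXT3_SB(sb))
		set_opt(sbi->s_mount_opt, DELALLOC);

So when ext4 stands in for ext3 (CONFIG_EXT4_USE_FOR_EXT23), delayed allocation is no longer enabled by default, preserving ext3's expected allocation behavior. The same section fixes the CONTIG_EXT2_FS/CONTIG_EXT3_FS misspellings of CONFIG_, which had made the "ext2/ext3 not built here" half of those #if tests vacuously true, and constifies ext4_attr_ops to match the sysfs_ops API change elsewhere in this merge.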
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 983c253999a7..8b145e98df07 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -7,6 +7,7 @@
7#include <linux/string.h> 7#include <linux/string.h>
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include <linux/security.h> 9#include <linux/security.h>
10#include <linux/slab.h>
10#include "ext4_jbd2.h" 11#include "ext4_jbd2.h"
11#include "ext4.h" 12#include "ext4.h"
12#include "xattr.h" 13#include "xattr.h"
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 923990e4f16e..113f0a1e565d 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/fs.h> 11#include <linux/fs.h>
12#include <linux/slab.h>
12#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
13#include "fat.h" 14#include "fat.h"
14 15
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index fbeecdc194dc..0ce143bd7d56 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -558,7 +558,7 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
558 buf->f_bavail = sbi->free_clusters; 558 buf->f_bavail = sbi->free_clusters;
559 buf->f_fsid.val[0] = (u32)id; 559 buf->f_fsid.val[0] = (u32)id;
560 buf->f_fsid.val[1] = (u32)(id >> 32); 560 buf->f_fsid.val[1] = (u32)(id >> 32);
561 buf->f_namelen = sbi->options.isvfat ? 260 : 12; 561 buf->f_namelen = sbi->options.isvfat ? FAT_LFN_LEN : 12;
562 562
563 return 0; 563 return 0;
564} 564}
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index f565f24019b5..6fcc7e71fbaa 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -309,7 +309,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
309{ 309{
310 struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options; 310 struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options;
311 wchar_t *ip, *ext_start, *end, *name_start; 311 wchar_t *ip, *ext_start, *end, *name_start;
312 unsigned char base[9], ext[4], buf[8], *p; 312 unsigned char base[9], ext[4], buf[5], *p;
313 unsigned char charbuf[NLS_MAX_CHARSET_SIZE]; 313 unsigned char charbuf[NLS_MAX_CHARSET_SIZE];
314 int chl, chi; 314 int chl, chi;
315 int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen; 315 int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen;
@@ -467,7 +467,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
467 return 0; 467 return 0;
468 } 468 }
469 469
470 i = jiffies & 0xffff; 470 i = jiffies;
471 sz = (jiffies >> 16) & 0x7; 471 sz = (jiffies >> 16) & 0x7;
472 if (baselen > 2) { 472 if (baselen > 2) {
473 baselen = numtail2_baselen; 473 baselen = numtail2_baselen;
@@ -476,7 +476,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
476 name_res[baselen + 4] = '~'; 476 name_res[baselen + 4] = '~';
477 name_res[baselen + 5] = '1' + sz; 477 name_res[baselen + 5] = '1' + sz;
478 while (1) { 478 while (1) {
479 sprintf(buf, "%04X", i); 479 snprintf(buf, sizeof(buf), "%04X", i & 0xffff);
480 memcpy(&name_res[baselen], buf, 4); 480 memcpy(&name_res[baselen], buf, 4);
481 if (vfat_find_form(dir, name_res) < 0) 481 if (vfat_find_form(dir, name_res) < 0)
482 break; 482 break;
@@ -502,14 +502,14 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
502 *outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname); 502 *outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname);
503 if (*outlen < 0) 503 if (*outlen < 0)
504 return *outlen; 504 return *outlen;
505 else if (*outlen > 255) 505 else if (*outlen > FAT_LFN_LEN)
506 return -ENAMETOOLONG; 506 return -ENAMETOOLONG;
507 507
508 op = &outname[*outlen * sizeof(wchar_t)]; 508 op = &outname[*outlen * sizeof(wchar_t)];
509 } else { 509 } else {
510 if (nls) { 510 if (nls) {
511 for (i = 0, ip = name, op = outname, *outlen = 0; 511 for (i = 0, ip = name, op = outname, *outlen = 0;
512 i < len && *outlen <= 255; 512 i < len && *outlen <= FAT_LFN_LEN;
513 *outlen += 1) 513 *outlen += 1)
514 { 514 {
515 if (escape && (*ip == ':')) { 515 if (escape && (*ip == ':')) {
@@ -549,7 +549,7 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
549 return -ENAMETOOLONG; 549 return -ENAMETOOLONG;
550 } else { 550 } else {
551 for (i = 0, ip = name, op = outname, *outlen = 0; 551 for (i = 0, ip = name, op = outname, *outlen = 0;
552 i < len && *outlen <= 255; 552 i < len && *outlen <= FAT_LFN_LEN;
553 i++, *outlen += 1) 553 i++, *outlen += 1)
554 { 554 {
555 *op++ = *ip++; 555 *op++ = *ip++;
@@ -701,6 +701,15 @@ static int vfat_find(struct inode *dir, struct qstr *qname,
701 return fat_search_long(dir, qname->name, len, sinfo); 701 return fat_search_long(dir, qname->name, len, sinfo);
702} 702}
703 703
704/*
705 * (nfsd's) anonymous disconnected dentry?
706 * NOTE: !IS_ROOT() is not anonymous (I.e. d_splice_alias() did the job).
707 */
708static int vfat_d_anon_disconn(struct dentry *dentry)
709{
710 return IS_ROOT(dentry) && (dentry->d_flags & DCACHE_DISCONNECTED);
711}
712
704static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, 713static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
705 struct nameidata *nd) 714 struct nameidata *nd)
706{ 715{
@@ -729,11 +738,11 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
729 } 738 }
730 739
731 alias = d_find_alias(inode); 740 alias = d_find_alias(inode);
732 if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) { 741 if (alias && !vfat_d_anon_disconn(alias)) {
733 /* 742 /*
734 * This inode has a non-DCACHE_DISCONNECTED dentry. This 743 * This inode has a dentry that is not anonymous-
735 * means the user did ->lookup() by another name 744 * DCACHE_DISCONNECTED. This means the user did ->lookup()
736 * (the long name vs. its 8.3 alias) in the past. 745 * by another name (the long name vs. its 8.3 alias) in the past.
737 * 746 *
738 * Switch to the new one for locality, if possible. 747 * Switch to the new one for locality, if possible.
739 */ 748 */
@@ -743,7 +752,9 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
743 iput(inode); 752 iput(inode);
744 unlock_super(sb); 753 unlock_super(sb);
745 return alias; 754 return alias;
746 } 755 } else
756 dput(alias);
757
747out: 758out:
748 unlock_super(sb); 759 unlock_super(sb);
749 dentry->d_op = sb->s_root->d_op; 760 dentry->d_op = sb->s_root->d_op;
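
The added dput(alias) above closes a dentry reference leak: d_find_alias() returns its result with a reference held, so a caller that decides not to use the alias must drop it. A condensed, illustrative restatement of that discipline (assumes the vfat_d_anon_disconn() helper added above is visible; not a drop-in for vfat_lookup()):

#include <linux/fs.h>
#include <linux/dcache.h>

static struct dentry *pick_alias(struct inode *inode)
{
	struct dentry *alias = d_find_alias(inode);	/* reference held */

	if (alias && !vfat_d_anon_disconn(alias))
		return alias;	/* reference passes on to our caller */

	dput(alias);		/* unused (dput() tolerates NULL): drop it */
	return NULL;
}
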
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 452d02f9075e..0a140741b39e 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -614,9 +614,15 @@ int send_sigurg(struct fown_struct *fown)
614 return ret; 614 return ret;
615} 615}
616 616
617static DEFINE_RWLOCK(fasync_lock); 617static DEFINE_SPINLOCK(fasync_lock);
618static struct kmem_cache *fasync_cache __read_mostly; 618static struct kmem_cache *fasync_cache __read_mostly;
619 619
620static void fasync_free_rcu(struct rcu_head *head)
621{
622 kmem_cache_free(fasync_cache,
623 container_of(head, struct fasync_struct, fa_rcu));
624}
625
620/* 626/*
621 * Remove a fasync entry. If successfully removed, return 627 * Remove a fasync entry. If successfully removed, return
622 * positive and clear the FASYNC flag. If no entry exists, 628 * positive and clear the FASYNC flag. If no entry exists,
@@ -625,8 +631,6 @@ static struct kmem_cache *fasync_cache __read_mostly;
625 * NOTE! It is very important that the FASYNC flag always 631 * NOTE! It is very important that the FASYNC flag always
626 * match the state "is the filp on a fasync list". 632 * match the state "is the filp on a fasync list".
627 * 633 *
628 * We always take the 'filp->f_lock' first, since fasync_lock
629 * needs to be irq-safe.
630 */ 634 */
631static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) 635static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
632{ 636{
@@ -634,17 +638,22 @@ static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
634 int result = 0; 638 int result = 0;
635 639
636 spin_lock(&filp->f_lock); 640 spin_lock(&filp->f_lock);
637 write_lock_irq(&fasync_lock); 641 spin_lock(&fasync_lock);
638 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { 642 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
639 if (fa->fa_file != filp) 643 if (fa->fa_file != filp)
640 continue; 644 continue;
645
646 spin_lock_irq(&fa->fa_lock);
647 fa->fa_file = NULL;
648 spin_unlock_irq(&fa->fa_lock);
649
641 *fp = fa->fa_next; 650 *fp = fa->fa_next;
642 kmem_cache_free(fasync_cache, fa); 651 call_rcu(&fa->fa_rcu, fasync_free_rcu);
643 filp->f_flags &= ~FASYNC; 652 filp->f_flags &= ~FASYNC;
644 result = 1; 653 result = 1;
645 break; 654 break;
646 } 655 }
647 write_unlock_irq(&fasync_lock); 656 spin_unlock(&fasync_lock);
648 spin_unlock(&filp->f_lock); 657 spin_unlock(&filp->f_lock);
649 return result; 658 return result;
650} 659}
@@ -666,25 +675,30 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
666 return -ENOMEM; 675 return -ENOMEM;
667 676
668 spin_lock(&filp->f_lock); 677 spin_lock(&filp->f_lock);
669 write_lock_irq(&fasync_lock); 678 spin_lock(&fasync_lock);
670 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { 679 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
671 if (fa->fa_file != filp) 680 if (fa->fa_file != filp)
672 continue; 681 continue;
682
683 spin_lock_irq(&fa->fa_lock);
673 fa->fa_fd = fd; 684 fa->fa_fd = fd;
685 spin_unlock_irq(&fa->fa_lock);
686
674 kmem_cache_free(fasync_cache, new); 687 kmem_cache_free(fasync_cache, new);
675 goto out; 688 goto out;
676 } 689 }
677 690
691 spin_lock_init(&new->fa_lock);
678 new->magic = FASYNC_MAGIC; 692 new->magic = FASYNC_MAGIC;
679 new->fa_file = filp; 693 new->fa_file = filp;
680 new->fa_fd = fd; 694 new->fa_fd = fd;
681 new->fa_next = *fapp; 695 new->fa_next = *fapp;
682 *fapp = new; 696 rcu_assign_pointer(*fapp, new);
683 result = 1; 697 result = 1;
684 filp->f_flags |= FASYNC; 698 filp->f_flags |= FASYNC;
685 699
686out: 700out:
687 write_unlock_irq(&fasync_lock); 701 spin_unlock(&fasync_lock);
688 spin_unlock(&filp->f_lock); 702 spin_unlock(&filp->f_lock);
689 return result; 703 return result;
690} 704}
@@ -704,37 +718,41 @@ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fap
704 718
705EXPORT_SYMBOL(fasync_helper); 719EXPORT_SYMBOL(fasync_helper);
706 720
707void __kill_fasync(struct fasync_struct *fa, int sig, int band) 721/*
722 * rcu_read_lock() is held
723 */
724static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
708{ 725{
709 while (fa) { 726 while (fa) {
710 struct fown_struct * fown; 727 struct fown_struct *fown;
711 if (fa->magic != FASYNC_MAGIC) { 728 if (fa->magic != FASYNC_MAGIC) {
712 printk(KERN_ERR "kill_fasync: bad magic number in " 729 printk(KERN_ERR "kill_fasync: bad magic number in "
713 "fasync_struct!\n"); 730 "fasync_struct!\n");
714 return; 731 return;
715 } 732 }
716 fown = &fa->fa_file->f_owner; 733 spin_lock(&fa->fa_lock);
717 /* Don't send SIGURG to processes which have not set a 734 if (fa->fa_file) {
718 queued signum: SIGURG has its own default signalling 735 fown = &fa->fa_file->f_owner;
719 mechanism. */ 736 /* Don't send SIGURG to processes which have not set a
720 if (!(sig == SIGURG && fown->signum == 0)) 737 queued signum: SIGURG has its own default signalling
721 send_sigio(fown, fa->fa_fd, band); 738 mechanism. */
722 fa = fa->fa_next; 739 if (!(sig == SIGURG && fown->signum == 0))
740 send_sigio(fown, fa->fa_fd, band);
741 }
742 spin_unlock(&fa->fa_lock);
743 fa = rcu_dereference(fa->fa_next);
723 } 744 }
724} 745}
725 746
726EXPORT_SYMBOL(__kill_fasync);
727
728void kill_fasync(struct fasync_struct **fp, int sig, int band) 747void kill_fasync(struct fasync_struct **fp, int sig, int band)
729{ 748{
730 /* First a quick test without locking: usually 749 /* First a quick test without locking: usually
731 * the list is empty. 750 * the list is empty.
732 */ 751 */
733 if (*fp) { 752 if (*fp) {
734 read_lock(&fasync_lock); 753 rcu_read_lock();
735 /* reread *fp after obtaining the lock */ 754 kill_fasync_rcu(rcu_dereference(*fp), sig, band);
736 __kill_fasync(*fp, sig, band); 755 rcu_read_unlock();
737 read_unlock(&fasync_lock);
738 } 756 }
739} 757}
740EXPORT_SYMBOL(kill_fasync); 758EXPORT_SYMBOL(kill_fasync);
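
Taken together, the fcntl.c changes above are the standard RCU list conversion: writers serialize on a plain (no longer IRQ-disabling) spinlock, publish new entries with rcu_assign_pointer(), and defer frees through call_rcu(), while kill_fasync() walks the list under rcu_read_lock()/rcu_dereference() without ever blocking writers. A condensed kernel-style sketch of the same shape (illustrative module code, not the patched file; the per-entry fa_lock taken above around fa_file/fa_fd is omitted for brevity):

#include <linux/types.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct node {
	int		val;
	struct node	*next;
	struct rcu_head	rcu;
};

static struct node *head;
static DEFINE_SPINLOCK(list_lock);	/* serializes writers; readers use RCU */

static void node_free_rcu(struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct node, rcu));
}

/* Writer: initialise fully, then publish with rcu_assign_pointer(). */
static void list_add_front(struct node *new)
{
	spin_lock(&list_lock);
	new->next = head;
	rcu_assign_pointer(head, new);
	spin_unlock(&list_lock);
}

/* Writer: unlink under the lock, free only after a grace period. */
static void list_remove(int val)
{
	struct node **fp, *n;

	spin_lock(&list_lock);
	for (fp = &head; (n = *fp) != NULL; fp = &n->next) {
		if (n->val != val)
			continue;
		*fp = n->next;			/* readers may still hold n */
		call_rcu(&n->rcu, node_free_rcu);
		break;
	}
	spin_unlock(&list_lock);
}

/* Reader: lockless walk, like kill_fasync_rcu() above. */
static bool list_contains(int val)
{
	struct node *n;
	bool hit = false;

	rcu_read_lock();
	for (n = rcu_dereference(head); n; n = rcu_dereference(n->next))
		if (n->val == val) {
			hit = true;
			break;
		}
	rcu_read_unlock();
	return hit;
}
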
diff --git a/fs/fifo.c b/fs/fifo.c
index f8f97b8b6d44..5d6606ffc2d2 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/slab.h>
14#include <linux/fs.h> 13#include <linux/fs.h>
15#include <linux/sched.h> 14#include <linux/sched.h>
16#include <linux/pipe_fs_i.h> 15#include <linux/pipe_fs_i.h>
diff --git a/fs/filesystems.c b/fs/filesystems.c
index a24c58e181db..68ba492d8eef 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -10,10 +10,10 @@
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/proc_fs.h> 11#include <linux/proc_fs.h>
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/slab.h>
14#include <linux/kmod.h> 13#include <linux/kmod.h>
15#include <linux/init.h> 14#include <linux/init.h>
16#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/slab.h>
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18 18
19/* 19/*
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index ed8f0b0dd880..1429f3ae1e86 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -33,7 +33,6 @@
33#include <linux/fs.h> 33#include <linux/fs.h>
34#include <linux/buffer_head.h> 34#include <linux/buffer_head.h>
35#include <linux/kernel.h> 35#include <linux/kernel.h>
36#include <linux/slab.h>
37#include <linux/pagemap.h> 36#include <linux/pagemap.h>
38 37
39#include "vxfs_extern.h" 38#include "vxfs_extern.h"
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 76fc4d594acb..4b37f7cea4dd 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -16,6 +16,7 @@
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/spinlock.h> 18#include <linux/spinlock.h>
19#include <linux/slab.h>
19#include <linux/sched.h> 20#include <linux/sched.h>
20#include <linux/fs.h> 21#include <linux/fs.h>
21#include <linux/mm.h> 22#include <linux/mm.h>
@@ -553,108 +554,85 @@ select_queue:
553 return ret; 554 return ret;
554} 555}
555 556
556static void unpin_sb_for_writeback(struct super_block **psb) 557static void unpin_sb_for_writeback(struct super_block *sb)
557{ 558{
558 struct super_block *sb = *psb; 559 up_read(&sb->s_umount);
559 560 put_super(sb);
560 if (sb) {
561 up_read(&sb->s_umount);
562 put_super(sb);
563 *psb = NULL;
564 }
565} 561}
566 562
563enum sb_pin_state {
564 SB_PINNED,
565 SB_NOT_PINNED,
566 SB_PIN_FAILED
567};
568
567/* 569/*
568 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned 570 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
569 * before calling writeback. So make sure that we do pin it, so it doesn't 571 * before calling writeback. So make sure that we do pin it, so it doesn't
570 * go away while we are writing inodes from it. 572 * go away while we are writing inodes from it.
571 *
572 * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
573 * 1 if we failed.
574 */ 573 */
575static int pin_sb_for_writeback(struct writeback_control *wbc, 574static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc,
576 struct inode *inode, struct super_block **psb) 575 struct super_block *sb)
577{ 576{
578 struct super_block *sb = inode->i_sb;
579
580 /*
581 * If this sb is already pinned, nothing more to do. If not and
582 * *psb is non-NULL, unpin the old one first
583 */
584 if (sb == *psb)
585 return 0;
586 else if (*psb)
587 unpin_sb_for_writeback(psb);
588
589 /* 577 /*
590 * Caller must already hold the ref for this 578 * Caller must already hold the ref for this
591 */ 579 */
592 if (wbc->sync_mode == WB_SYNC_ALL) { 580 if (wbc->sync_mode == WB_SYNC_ALL) {
593 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 581 WARN_ON(!rwsem_is_locked(&sb->s_umount));
594 return 0; 582 return SB_NOT_PINNED;
595 } 583 }
596
597 spin_lock(&sb_lock); 584 spin_lock(&sb_lock);
598 sb->s_count++; 585 sb->s_count++;
599 if (down_read_trylock(&sb->s_umount)) { 586 if (down_read_trylock(&sb->s_umount)) {
600 if (sb->s_root) { 587 if (sb->s_root) {
601 spin_unlock(&sb_lock); 588 spin_unlock(&sb_lock);
602 goto pinned; 589 return SB_PINNED;
603 } 590 }
604 /* 591 /*
605 * umounted, drop rwsem again and fall through to failure 592 * umounted, drop rwsem again and fall through to failure
606 */ 593 */
607 up_read(&sb->s_umount); 594 up_read(&sb->s_umount);
608 } 595 }
609
610 sb->s_count--; 596 sb->s_count--;
611 spin_unlock(&sb_lock); 597 spin_unlock(&sb_lock);
612 return 1; 598 return SB_PIN_FAILED;
613pinned:
614 *psb = sb;
615 return 0;
616} 599}
617 600
618static void writeback_inodes_wb(struct bdi_writeback *wb, 601/*
619 struct writeback_control *wbc) 602 * Write a portion of b_io inodes which belong to @sb.
603 * If @wbc->sb != NULL, then find and write all such
604 * inodes. Otherwise, write only the ones that occur
605 * sequentially, in reverse order.
606 * Return 1 if the caller's writeback routine should be
607 * interrupted; otherwise return 0.
608 */
609static int writeback_sb_inodes(struct super_block *sb,
610 struct bdi_writeback *wb,
611 struct writeback_control *wbc)
620{ 612{
621 struct super_block *sb = wbc->sb, *pin_sb = NULL;
622 const unsigned long start = jiffies; /* livelock avoidance */
623
624 spin_lock(&inode_lock);
625
626 if (!wbc->for_kupdate || list_empty(&wb->b_io))
627 queue_io(wb, wbc->older_than_this);
628
629 while (!list_empty(&wb->b_io)) { 613 while (!list_empty(&wb->b_io)) {
630 struct inode *inode = list_entry(wb->b_io.prev,
631 struct inode, i_list);
632 long pages_skipped; 614 long pages_skipped;
633 615 struct inode *inode = list_entry(wb->b_io.prev,
634 /* 616 struct inode, i_list);
635 * super block given and doesn't match, skip this inode 617 if (wbc->sb && sb != inode->i_sb) {
636 */ 618 /* super block given and doesn't
637 if (sb && sb != inode->i_sb) { 619 match, skip this inode */
638 redirty_tail(inode); 620 redirty_tail(inode);
639 continue; 621 continue;
640 } 622 }
641 623 if (sb != inode->i_sb)
624 /* finish with this superblock */
625 return 0;
642 if (inode->i_state & (I_NEW | I_WILL_FREE)) { 626 if (inode->i_state & (I_NEW | I_WILL_FREE)) {
643 requeue_io(inode); 627 requeue_io(inode);
644 continue; 628 continue;
645 } 629 }
646
647 /* 630 /*
648 * Was this inode dirtied after sync_sb_inodes was called? 631 * Was this inode dirtied after sync_sb_inodes was called?
649 * This keeps sync from extra jobs and livelock. 632 * This keeps sync from extra jobs and livelock.
650 */ 633 */
651 if (inode_dirtied_after(inode, start)) 634 if (inode_dirtied_after(inode, wbc->wb_start))
652 break; 635 return 1;
653
654 if (pin_sb_for_writeback(wbc, inode, &pin_sb)) {
655 requeue_io(inode);
656 continue;
657 }
658 636
659 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); 637 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
660 __iget(inode); 638 __iget(inode);
@@ -673,14 +651,50 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
673 spin_lock(&inode_lock); 651 spin_lock(&inode_lock);
674 if (wbc->nr_to_write <= 0) { 652 if (wbc->nr_to_write <= 0) {
675 wbc->more_io = 1; 653 wbc->more_io = 1;
676 break; 654 return 1;
677 } 655 }
678 if (!list_empty(&wb->b_more_io)) 656 if (!list_empty(&wb->b_more_io))
679 wbc->more_io = 1; 657 wbc->more_io = 1;
680 } 658 }
659 /* b_io is empty */
660 return 1;
661}
662
663static void writeback_inodes_wb(struct bdi_writeback *wb,
664 struct writeback_control *wbc)
665{
666 int ret = 0;
681 667
682 unpin_sb_for_writeback(&pin_sb); 668 wbc->wb_start = jiffies; /* livelock avoidance */
669 spin_lock(&inode_lock);
670 if (!wbc->for_kupdate || list_empty(&wb->b_io))
671 queue_io(wb, wbc->older_than_this);
672
673 while (!list_empty(&wb->b_io)) {
674 struct inode *inode = list_entry(wb->b_io.prev,
675 struct inode, i_list);
676 struct super_block *sb = inode->i_sb;
677 enum sb_pin_state state;
678
679 if (wbc->sb && sb != wbc->sb) {
680 /* super block given and doesn't
681 match, skip this inode */
682 redirty_tail(inode);
683 continue;
684 }
685 state = pin_sb_for_writeback(wbc, sb);
686
687 if (state == SB_PIN_FAILED) {
688 requeue_io(inode);
689 continue;
690 }
691 ret = writeback_sb_inodes(sb, wb, wbc);
683 692
693 if (state == SB_PINNED)
694 unpin_sb_for_writeback(sb);
695 if (ret)
696 break;
697 }
684 spin_unlock(&inode_lock); 698 spin_unlock(&inode_lock);
685 /* Leave any unwritten inodes on b_io */ 699 /* Leave any unwritten inodes on b_io */
686} 700}
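
The writeback refactor above replaces the old out-parameter contract of pin_sb_for_writeback() with an explicit tri-state, so the caller can tell "pinned, must unpin" apart from "no pin was needed" (the WB_SYNC_ALL case) and "failed, requeue". A userspace sketch of that shape, with a pthread rwlock standing in for s_umount and the sb_lock/s_count refcounting elided (all names illustrative):

#include <pthread.h>

enum pin_state { PINNED, NOT_PINNED, PIN_FAILED };

struct resource {
	pthread_rwlock_t umount;	/* stands in for sb->s_umount */
	int alive;			/* stands in for sb->s_root != NULL */
};

static enum pin_state pin(struct resource *r, int caller_already_holds)
{
	if (caller_already_holds)
		return NOT_PINNED;	/* nothing for the caller to undo */
	if (pthread_rwlock_tryrdlock(&r->umount) == 0) {
		if (r->alive)
			return PINNED;
		pthread_rwlock_unlock(&r->umount);	/* torn down under us */
	}
	return PIN_FAILED;
}

static void unpin(struct resource *r)
{
	pthread_rwlock_unlock(&r->umount);
}

int main(void)
{
	struct resource r = { .alive = 1 };
	enum pin_state st;

	pthread_rwlock_init(&r.umount, NULL);
	st = pin(&r, 0);
	if (st == PIN_FAILED)
		return 1;	/* requeue_io() in the original */
	/* ... writeback_sb_inodes() would run here ... */
	if (st == PINNED)
		unpin(&r);	/* undo only what we ourselves took */
	return 0;
}
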
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
index 864dac20a242..cc94bb9563f2 100644
--- a/fs/fscache/Kconfig
+++ b/fs/fscache/Kconfig
@@ -1,7 +1,6 @@
1 1
2config FSCACHE 2config FSCACHE
3 tristate "General filesystem local caching manager" 3 tristate "General filesystem local caching manager"
4 depends on EXPERIMENTAL
5 select SLOW_WORK 4 select SLOW_WORK
6 help 5 help
7 This option enables a generic filesystem caching manager that can be 6 This option enables a generic filesystem caching manager that can be
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 3221a0c7944e..1e1f286dd70e 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -12,6 +12,7 @@
12#define FSCACHE_DEBUG_LEVEL COOKIE 12#define FSCACHE_DEBUG_LEVEL COOKIE
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/seq_file.h> 14#include <linux/seq_file.h>
15#include <linux/slab.h>
15#include <linux/key.h> 16#include <linux/key.h>
16#include <keys/user-type.h> 17#include <keys/user-type.h>
17#include "internal.h" 18#include "internal.h"
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index e513ac599c8e..0b589a9b4ffc 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -53,7 +53,7 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
53static void fscache_object_slow_work_put_ref(struct slow_work *); 53static void fscache_object_slow_work_put_ref(struct slow_work *);
54static int fscache_object_slow_work_get_ref(struct slow_work *); 54static int fscache_object_slow_work_get_ref(struct slow_work *);
55static void fscache_object_slow_work_execute(struct slow_work *); 55static void fscache_object_slow_work_execute(struct slow_work *);
56#ifdef CONFIG_SLOW_WORK_PROC 56#ifdef CONFIG_SLOW_WORK_DEBUG
57static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *); 57static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *);
58#endif 58#endif
59static void fscache_initialise_object(struct fscache_object *); 59static void fscache_initialise_object(struct fscache_object *);
@@ -69,7 +69,7 @@ const struct slow_work_ops fscache_object_slow_work_ops = {
69 .get_ref = fscache_object_slow_work_get_ref, 69 .get_ref = fscache_object_slow_work_get_ref,
70 .put_ref = fscache_object_slow_work_put_ref, 70 .put_ref = fscache_object_slow_work_put_ref,
71 .execute = fscache_object_slow_work_execute, 71 .execute = fscache_object_slow_work_execute,
72#ifdef CONFIG_SLOW_WORK_PROC 72#ifdef CONFIG_SLOW_WORK_DEBUG
73 .desc = fscache_object_slow_work_desc, 73 .desc = fscache_object_slow_work_desc,
74#endif 74#endif
75}; 75};
@@ -364,7 +364,7 @@ static void fscache_object_slow_work_execute(struct slow_work *work)
364/* 364/*
365 * describe an object for slow-work debugging 365 * describe an object for slow-work debugging
366 */ 366 */
367#ifdef CONFIG_SLOW_WORK_PROC 367#ifdef CONFIG_SLOW_WORK_DEBUG
368static void fscache_object_slow_work_desc(struct slow_work *work, 368static void fscache_object_slow_work_desc(struct slow_work *work,
369 struct seq_file *m) 369 struct seq_file *m)
370{ 370{
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 313e79a14266..f17cecafae44 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -14,6 +14,7 @@
14#define FSCACHE_DEBUG_LEVEL OPERATION 14#define FSCACHE_DEBUG_LEVEL OPERATION
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/seq_file.h> 16#include <linux/seq_file.h>
17#include <linux/slab.h>
17#include "internal.h" 18#include "internal.h"
18 19
19atomic_t fscache_op_debug_id; 20atomic_t fscache_op_debug_id;
@@ -500,7 +501,7 @@ static void fscache_op_execute(struct slow_work *work)
500/* 501/*
501 * describe an operation for slow-work debugging 502 * describe an operation for slow-work debugging
502 */ 503 */
503#ifdef CONFIG_SLOW_WORK_PROC 504#ifdef CONFIG_SLOW_WORK_DEBUG
504static void fscache_op_desc(struct slow_work *work, struct seq_file *m) 505static void fscache_op_desc(struct slow_work *work, struct seq_file *m)
505{ 506{
506 struct fscache_operation *op = 507 struct fscache_operation *op =
@@ -517,7 +518,7 @@ const struct slow_work_ops fscache_op_slow_work_ops = {
517 .get_ref = fscache_op_get_ref, 518 .get_ref = fscache_op_get_ref,
518 .put_ref = fscache_op_put_ref, 519 .put_ref = fscache_op_put_ref,
519 .execute = fscache_op_execute, 520 .execute = fscache_op_execute,
520#ifdef CONFIG_SLOW_WORK_PROC 521#ifdef CONFIG_SLOW_WORK_DEBUG
521 .desc = fscache_op_desc, 522 .desc = fscache_op_desc,
522#endif 523#endif
523}; 524};
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index c598ea4c4e7d..47aefd376e54 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -14,6 +14,7 @@
14#include <linux/fscache-cache.h> 14#include <linux/fscache-cache.h>
15#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
16#include <linux/pagevec.h> 16#include <linux/pagevec.h>
17#include <linux/slab.h>
17#include "internal.h" 18#include "internal.h"
18 19
19/* 20/*
@@ -881,6 +882,7 @@ submit_failed:
881 goto nobufs; 882 goto nobufs;
882 883
883nobufs_unlock_obj: 884nobufs_unlock_obj:
885 spin_unlock(&cookie->stores_lock);
884 spin_unlock(&object->lock); 886 spin_unlock(&object->lock);
885nobufs: 887nobufs:
886 spin_unlock(&cookie->lock); 888 spin_unlock(&cookie->lock);
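
The one-line fscache fix above adds the unlock of cookie->stores_lock that the nobufs_unlock_obj error path was missing; each goto label must release exactly the locks taken before the failure point, in reverse order. A kernel-style sketch of the idiom under assumed minimal types (xcookie, xobject, and the two checks are hypothetical):

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/spinlock.h>

struct xcookie { spinlock_t lock; spinlock_t stores_lock; };
struct xobject { spinlock_t lock; };

static bool object_live(struct xobject *object) { return true; }  /* hypothetical */
static bool queue_store(struct xobject *object) { return true; }  /* hypothetical */

static int do_store(struct xobject *object, struct xcookie *cookie)
{
	spin_lock(&cookie->lock);
	if (!object_live(object))
		goto nobufs;

	spin_lock(&object->lock);
	spin_lock(&cookie->stores_lock);
	if (!queue_store(object))
		goto nobufs_unlock_obj;

	spin_unlock(&cookie->stores_lock);
	spin_unlock(&object->lock);
	spin_unlock(&cookie->lock);
	return 0;

nobufs_unlock_obj:
	spin_unlock(&cookie->stores_lock);	/* the unlock the hunk adds */
	spin_unlock(&object->lock);
nobufs:
	spin_unlock(&cookie->lock);
	return -ENOBUFS;
}
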
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 46435f3aae68..4765190d537f 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -165,8 +165,8 @@ static int fscache_stats_show(struct seq_file *m, void *v)
165 atomic_read(&fscache_n_object_lookups), 165 atomic_read(&fscache_n_object_lookups),
166 atomic_read(&fscache_n_object_lookups_negative), 166 atomic_read(&fscache_n_object_lookups_negative),
167 atomic_read(&fscache_n_object_lookups_positive), 167 atomic_read(&fscache_n_object_lookups_positive),
168 atomic_read(&fscache_n_object_lookups_timed_out), 168 atomic_read(&fscache_n_object_created),
169 atomic_read(&fscache_n_object_created)); 169 atomic_read(&fscache_n_object_lookups_timed_out));
170 170
171 seq_printf(m, "Updates: n=%u nul=%u run=%u\n", 171 seq_printf(m, "Updates: n=%u nul=%u run=%u\n",
172 atomic_read(&fscache_n_updates), 172 atomic_read(&fscache_n_updates),
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index de792dcf3274..e1f8171278bd 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -44,6 +44,7 @@
44#include <linux/magic.h> 44#include <linux/magic.h>
45#include <linux/miscdevice.h> 45#include <linux/miscdevice.h>
46#include <linux/mutex.h> 46#include <linux/mutex.h>
47#include <linux/slab.h>
47#include <linux/spinlock.h> 48#include <linux/spinlock.h>
48#include <linux/stat.h> 49#include <linux/stat.h>
49 50
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 1a822ce2b24b..ec14d19ce501 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -850,7 +850,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
850 req->in.args[0].size = sizeof(*arg); 850 req->in.args[0].size = sizeof(*arg);
851 req->in.args[0].value = arg; 851 req->in.args[0].value = arg;
852 req->out.numargs = 1; 852 req->out.numargs = 1;
853 /* Variable length arguement used for backward compatibility 853 /* Variable length argument used for backward compatibility
854 with interface version < 7.5. Rest of init_out is zeroed 854 with interface version < 7.5. Rest of init_out is zeroed
855 by do_get_request(), so a short reply is not a problem */ 855 by do_get_request(), so a short reply is not a problem */
856 req->out.argvar = 1; 856 req->out.argvar = 1;
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 55458031e501..fe5df5457656 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/gfp.h>
10#include <linux/fs.h> 11#include <linux/fs.h>
11#include <linux/generic_acl.h> 12#include <linux/generic_acl.h>
12#include <linux/posix_acl.h> 13#include <linux/posix_acl.h>
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 4dcddf83326f..a47b43107112 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -8,7 +8,6 @@ config GFS2_FS
8 select FS_POSIX_ACL 8 select FS_POSIX_ACL
9 select CRC32 9 select CRC32
10 select SLOW_WORK 10 select SLOW_WORK
11 select QUOTA
12 select QUOTACTL 11 select QUOTACTL
13 help 12 help
14 A cluster filesystem. 13 A cluster filesystem.
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 0c1d0b82dcf1..a739a0a48067 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -418,6 +418,7 @@ static int gfs2_jdata_writepages(struct address_space *mapping,
418static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) 418static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
419{ 419{
420 struct buffer_head *dibh; 420 struct buffer_head *dibh;
421 u64 dsize = i_size_read(&ip->i_inode);
421 void *kaddr; 422 void *kaddr;
422 int error; 423 int error;
423 424
@@ -437,9 +438,10 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
437 return error; 438 return error;
438 439
439 kaddr = kmap_atomic(page, KM_USER0); 440 kaddr = kmap_atomic(page, KM_USER0);
440 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), 441 if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
441 ip->i_disksize); 442 dsize = (dibh->b_size - sizeof(struct gfs2_dinode));
442 memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize); 443 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
444 memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
443 kunmap_atomic(kaddr, KM_USER0); 445 kunmap_atomic(kaddr, KM_USER0);
444 flush_dcache_page(page); 446 flush_dcache_page(page);
445 brelse(dibh); 447 brelse(dibh);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 583e823307ae..4a48c0f4b402 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
@@ -72,11 +71,13 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
72 71
73 if (!PageUptodate(page)) { 72 if (!PageUptodate(page)) {
74 void *kaddr = kmap(page); 73 void *kaddr = kmap(page);
74 u64 dsize = i_size_read(inode);
75
76 if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
77 dsize = dibh->b_size - sizeof(struct gfs2_dinode);
75 78
76 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), 79 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
77 ip->i_disksize); 80 memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
78 memset(kaddr + ip->i_disksize, 0,
79 PAGE_CACHE_SIZE - ip->i_disksize);
80 kunmap(page); 81 kunmap(page);
81 82
82 SetPageUptodate(page); 83 SetPageUptodate(page);
@@ -1039,13 +1040,14 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
1039 goto out; 1040 goto out;
1040 1041
1041 if (gfs2_is_stuffed(ip)) { 1042 if (gfs2_is_stuffed(ip)) {
1042 ip->i_disksize = size; 1043 u64 dsize = size + sizeof(struct gfs2_dinode);
1043 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1044 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1044 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1045 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1045 gfs2_dinode_out(ip, dibh->b_data); 1046 gfs2_dinode_out(ip, dibh->b_data);
1046 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size); 1047 if (dsize > dibh->b_size)
1048 dsize = dibh->b_size;
1049 gfs2_buffer_clear_tail(dibh, dsize);
1047 error = 1; 1050 error = 1;
1048
1049 } else { 1051 } else {
1050 if (size & (u64)(sdp->sd_sb.sb_bsize - 1)) 1052 if (size & (u64)(sdp->sd_sb.sb_bsize - 1))
1051 error = gfs2_block_truncate_page(ip->i_inode.i_mapping); 1053 error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
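
The three gfs2 hunks above (stuffed_readpage(), gfs2_unstuffer_page(), trunc_start()) all add the same guard: the byte count derived from the inode size is clamped to what the dinode block can actually hold past its on-disk header before any copy or clear-tail touches it. A userspace sketch of the clamp, assuming block size does not exceed page size as in gfs2 (HDR_SIZE is an illustrative constant, not the real sizeof(struct gfs2_dinode)):

#include <stdint.h>
#include <string.h>

#define PAGE_SIZE 4096
#define HDR_SIZE  232	/* illustrative stand-in for the on-disk header size */

/* Copy stuffed data into a page, zero-filling the tail. dsize derives
 * from the inode size and is not trusted: clamp it to the block first. */
static void fill_page(uint8_t *page, const uint8_t *block,
		      size_t block_size, uint64_t dsize)
{
	if (dsize > block_size - HDR_SIZE)
		dsize = block_size - HDR_SIZE;	/* never read past the block */

	memcpy(page, block + HDR_SIZE, (size_t)dsize);
	memset(page + dsize, 0, PAGE_SIZE - (size_t)dsize);
}
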
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 91beddadd388..bb7907bde3d8 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 25fddc100f18..8295c5b5d4a9 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1475,7 +1475,7 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name)
1475 inode = gfs2_inode_lookup(dir->i_sb, 1475 inode = gfs2_inode_lookup(dir->i_sb,
1476 be16_to_cpu(dent->de_type), 1476 be16_to_cpu(dent->de_type),
1477 be64_to_cpu(dent->de_inum.no_addr), 1477 be64_to_cpu(dent->de_inum.no_addr),
1478 be64_to_cpu(dent->de_inum.no_formal_ino), 0); 1478 be64_to_cpu(dent->de_inum.no_formal_ino));
1479 brelse(bh); 1479 brelse(bh);
1480 return inode; 1480 return inode;
1481 } 1481 }
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index d15876e9aa26..dfe237a3f8ad 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
@@ -169,7 +168,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
169 if (error) 168 if (error)
170 goto fail; 169 goto fail;
171 170
172 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0, 0); 171 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0);
173 if (IS_ERR(inode)) { 172 if (IS_ERR(inode)) {
174 error = PTR_ERR(inode); 173 error = PTR_ERR(inode);
175 goto fail; 174 goto fail;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index a6abbae8a278..e6dd2aec6f82 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -640,7 +640,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
640 640
641 if (!(fl->fl_flags & FL_POSIX)) 641 if (!(fl->fl_flags & FL_POSIX))
642 return -ENOLCK; 642 return -ENOLCK;
643 if (__mandatory_lock(&ip->i_inode)) 643 if (__mandatory_lock(&ip->i_inode) && fl->fl_type != F_UNLCK)
644 return -ENOLCK; 644 return -ENOLCK;
645 645
646 if (cmd == F_CANCELLK) { 646 if (cmd == F_CANCELLK) {
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 454d4b4eb36b..ddcdbf493536 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -855,6 +855,9 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *
855 gh->gh_flags = flags; 855 gh->gh_flags = flags;
856 gh->gh_iflags = 0; 856 gh->gh_iflags = 0;
857 gh->gh_ip = (unsigned long)__builtin_return_address(0); 857 gh->gh_ip = (unsigned long)__builtin_return_address(0);
858 if (gh->gh_owner_pid)
859 put_pid(gh->gh_owner_pid);
860 gh->gh_owner_pid = get_pid(task_pid(current));
858} 861}
859 862
860/** 863/**
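
The three lines added to gfs2_holder_reinit() above stop a struct pid reference leak when a holder is reused: the stale reference is dropped with put_pid() before get_pid() pins the current task's pid. The swap in isolation, as an illustrative kernel-style helper:

#include <linux/pid.h>
#include <linux/sched.h>

/* Re-point a cached pid reference at the current task without leaking
 * the old reference (mirrors the gfs2_holder_reinit() fix above). */
static void holder_set_owner(struct pid **owner)
{
	if (*owner)
		put_pid(*owner);		/* drop the stale reference */
	*owner = get_pid(task_pid(current));	/* take a fresh one */
}
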
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 38e3749d476c..49f97d3bb690 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index b8025e51cabf..b5d7363b22da 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -439,9 +439,6 @@ struct gfs2_args {
439struct gfs2_tune { 439struct gfs2_tune {
440 spinlock_t gt_spin; 440 spinlock_t gt_spin;
441 441
442 unsigned int gt_incore_log_blocks;
443 unsigned int gt_log_flush_secs;
444
445 unsigned int gt_logd_secs; 442 unsigned int gt_logd_secs;
446 443
447 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ 444 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
@@ -462,6 +459,7 @@ enum {
462 SDF_SHUTDOWN = 2, 459 SDF_SHUTDOWN = 2,
463 SDF_NOBARRIERS = 3, 460 SDF_NOBARRIERS = 3,
464 SDF_NORECOVERY = 4, 461 SDF_NORECOVERY = 4,
462 SDF_DEMOTE = 5,
465}; 463};
466 464
467#define GFS2_FSNAME_LEN 256 465#define GFS2_FSNAME_LEN 256
@@ -616,8 +614,9 @@ struct gfs2_sbd {
616 unsigned int sd_log_blks_reserved; 614 unsigned int sd_log_blks_reserved;
617 unsigned int sd_log_commited_buf; 615 unsigned int sd_log_commited_buf;
618 unsigned int sd_log_commited_databuf; 616 unsigned int sd_log_commited_databuf;
619 unsigned int sd_log_commited_revoke; 617 int sd_log_commited_revoke;
620 618
619 atomic_t sd_log_pinned;
621 unsigned int sd_log_num_buf; 620 unsigned int sd_log_num_buf;
622 unsigned int sd_log_num_revoke; 621 unsigned int sd_log_num_revoke;
623 unsigned int sd_log_num_rg; 622 unsigned int sd_log_num_rg;
@@ -629,15 +628,17 @@ struct gfs2_sbd {
629 struct list_head sd_log_le_databuf; 628 struct list_head sd_log_le_databuf;
630 struct list_head sd_log_le_ordered; 629 struct list_head sd_log_le_ordered;
631 630
631 atomic_t sd_log_thresh1;
632 atomic_t sd_log_thresh2;
632 atomic_t sd_log_blks_free; 633 atomic_t sd_log_blks_free;
633 struct mutex sd_log_reserve_mutex; 634 wait_queue_head_t sd_log_waitq;
635 wait_queue_head_t sd_logd_waitq;
634 636
635 u64 sd_log_sequence; 637 u64 sd_log_sequence;
636 unsigned int sd_log_head; 638 unsigned int sd_log_head;
637 unsigned int sd_log_tail; 639 unsigned int sd_log_tail;
638 int sd_log_idle; 640 int sd_log_idle;
639 641
640 unsigned long sd_log_flush_time;
641 struct rw_semaphore sd_log_flush_lock; 642 struct rw_semaphore sd_log_flush_lock;
642 atomic_t sd_log_in_flight; 643 atomic_t sd_log_in_flight;
643 wait_queue_head_t sd_log_flush_wait; 644 wait_queue_head_t sd_log_flush_wait;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index b1bf2694fb2b..51d8061fa07a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -158,7 +158,6 @@ void gfs2_set_iop(struct inode *inode)
158 * @sb: The super block 158 * @sb: The super block
159 * @no_addr: The inode number 159 * @no_addr: The inode number
160 * @type: The type of the inode 160 * @type: The type of the inode
161 * @skip_freeing: set this not return an inode if it is currently being freed.
162 * 161 *
163 * Returns: A VFS inode, or an error 162 * Returns: A VFS inode, or an error
164 */ 163 */
@@ -166,17 +165,14 @@ void gfs2_set_iop(struct inode *inode)
166struct inode *gfs2_inode_lookup(struct super_block *sb, 165struct inode *gfs2_inode_lookup(struct super_block *sb,
167 unsigned int type, 166 unsigned int type,
168 u64 no_addr, 167 u64 no_addr,
169 u64 no_formal_ino, int skip_freeing) 168 u64 no_formal_ino)
170{ 169{
171 struct inode *inode; 170 struct inode *inode;
172 struct gfs2_inode *ip; 171 struct gfs2_inode *ip;
173 struct gfs2_glock *io_gl; 172 struct gfs2_glock *io_gl;
174 int error; 173 int error;
175 174
176 if (skip_freeing) 175 inode = gfs2_iget(sb, no_addr);
177 inode = gfs2_iget_skip(sb, no_addr);
178 else
179 inode = gfs2_iget(sb, no_addr);
180 ip = GFS2_I(inode); 176 ip = GFS2_I(inode);
181 177
182 if (!inode) 178 if (!inode)
@@ -234,13 +230,100 @@ fail_glock:
234fail_iopen: 230fail_iopen:
235 gfs2_glock_put(io_gl); 231 gfs2_glock_put(io_gl);
236fail_put: 232fail_put:
237 ip->i_gl->gl_object = NULL; 233 if (inode->i_state & I_NEW)
234 ip->i_gl->gl_object = NULL;
238 gfs2_glock_put(ip->i_gl); 235 gfs2_glock_put(ip->i_gl);
239fail: 236fail:
240 iget_failed(inode); 237 if (inode->i_state & I_NEW)
238 iget_failed(inode);
239 else
240 iput(inode);
241 return ERR_PTR(error); 241 return ERR_PTR(error);
242} 242}
243 243
244/**
245 * gfs2_unlinked_inode_lookup - Lookup an unlinked inode for reclamation
246 * @sb: The super block
247 * @no_addr: The inode number
248 * @inode: A pointer to the inode found, if any
249 *
250 * Returns: 0 and *inode if no errors occurred. If an error occurs,
251 * the resulting *inode may or may not be NULL.
252 */
253
254int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr,
255 struct inode **inode)
256{
257 struct gfs2_sbd *sdp;
258 struct gfs2_inode *ip;
259 struct gfs2_glock *io_gl;
260 int error;
261 struct gfs2_holder gh;
262
263 *inode = gfs2_iget_skip(sb, no_addr);
264
265 if (!(*inode))
266 return -ENOBUFS;
267
268 if (!((*inode)->i_state & I_NEW))
269 return -ENOBUFS;
270
271 ip = GFS2_I(*inode);
272 sdp = GFS2_SB(*inode);
273 ip->i_no_formal_ino = -1;
274
275 error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
276 if (unlikely(error))
277 goto fail;
278 ip->i_gl->gl_object = ip;
279
280 error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
281 if (unlikely(error))
282 goto fail_put;
283
284 set_bit(GIF_INVALID, &ip->i_flags);
285 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT,
286 &ip->i_iopen_gh);
287 if (unlikely(error)) {
288 if (error == GLR_TRYFAILED)
289 error = 0;
290 goto fail_iopen;
291 }
292 ip->i_iopen_gh.gh_gl->gl_object = ip;
293 gfs2_glock_put(io_gl);
294
295 (*inode)->i_mode = DT2IF(DT_UNKNOWN);
296
297 /*
298 * We must read the inode in order to work out its type in
299 * this case. Note that this doesn't happen often as we normally
300 * know the type beforehand. This code path only occurs during
301 * unlinked inode recovery (where it is safe to take this glock,
302 * which is not true in the general case).
303 */
304 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY,
305 &gh);
306 if (unlikely(error)) {
307 if (error == GLR_TRYFAILED)
308 error = 0;
309 goto fail_glock;
310 }
311 /* Inode is now uptodate */
312 gfs2_glock_dq_uninit(&gh);
313 gfs2_set_iop(*inode);
314
315 return 0;
316fail_glock:
317 gfs2_glock_dq(&ip->i_iopen_gh);
318fail_iopen:
319 gfs2_glock_put(io_gl);
320fail_put:
321 ip->i_gl->gl_object = NULL;
322 gfs2_glock_put(ip->i_gl);
323fail:
324 return error;
325}
326
244static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) 327static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
245{ 328{
246 const struct gfs2_dinode *str = buf; 329 const struct gfs2_dinode *str = buf;
@@ -862,7 +945,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
862 goto fail_gunlock2; 945 goto fail_gunlock2;
863 946
864 inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr, 947 inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr,
865 inum.no_formal_ino, 0); 948 inum.no_formal_ino);
866 if (IS_ERR(inode)) 949 if (IS_ERR(inode))
867 goto fail_gunlock2; 950 goto fail_gunlock2;
868 951
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c341aaf67adb..e161461d4c57 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -83,8 +83,9 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
83 83
84extern void gfs2_set_iop(struct inode *inode); 84extern void gfs2_set_iop(struct inode *inode);
85extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 85extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
86 u64 no_addr, u64 no_formal_ino, 86 u64 no_addr, u64 no_formal_ino);
87 int skip_freeing); 87extern int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr,
88 struct inode **inode);
88extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); 89extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
89 90
90extern int gfs2_inode_refresh(struct gfs2_inode *ip); 91extern int gfs2_inode_refresh(struct gfs2_inode *ip);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 569b46240f61..0e0470ed34c2 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -9,6 +9,7 @@
9 9
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/dlm.h> 11#include <linux/dlm.h>
12#include <linux/slab.h>
12#include <linux/types.h> 13#include <linux/types.h>
13#include <linux/gfs2_ondisk.h> 14#include <linux/gfs2_ondisk.h>
14 15
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 4511b08fc451..b593f0e28f25 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -168,12 +168,11 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int fl
168 return list_empty(&ai->ai_ail1_list); 168 return list_empty(&ai->ai_ail1_list);
169} 169}
170 170
171static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags) 171static void gfs2_ail1_start(struct gfs2_sbd *sdp)
172{ 172{
173 struct list_head *head; 173 struct list_head *head;
174 u64 sync_gen; 174 u64 sync_gen;
175 struct list_head *first; 175 struct gfs2_ail *ai;
176 struct gfs2_ail *first_ai, *ai, *tmp;
177 int done = 0; 176 int done = 0;
178 177
179 gfs2_log_lock(sdp); 178 gfs2_log_lock(sdp);
@@ -184,21 +183,9 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
184 } 183 }
185 sync_gen = sdp->sd_ail_sync_gen++; 184 sync_gen = sdp->sd_ail_sync_gen++;
186 185
187 first = head->prev;
188 first_ai = list_entry(first, struct gfs2_ail, ai_list);
189 first_ai->ai_sync_gen = sync_gen;
190 gfs2_ail1_start_one(sdp, first_ai); /* This may drop log lock */
191
192 if (flags & DIO_ALL)
193 first = NULL;
194
195 while(!done) { 186 while(!done) {
196 if (first && (head->prev != first ||
197 gfs2_ail1_empty_one(sdp, first_ai, 0)))
198 break;
199
200 done = 1; 187 done = 1;
201 list_for_each_entry_safe_reverse(ai, tmp, head, ai_list) { 188 list_for_each_entry_reverse(ai, head, ai_list) {
202 if (ai->ai_sync_gen >= sync_gen) 189 if (ai->ai_sync_gen >= sync_gen)
203 continue; 190 continue;
204 ai->ai_sync_gen = sync_gen; 191 ai->ai_sync_gen = sync_gen;
@@ -290,58 +277,57 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
290 * flush time, so we ensure that we have just enough free blocks at all 277 * flush time, so we ensure that we have just enough free blocks at all
291 * times to avoid running out during a log flush. 278 * times to avoid running out during a log flush.
292 * 279 *
280 * We no longer flush the log here, instead we wake up logd to do that
281 * for us. To avoid the thundering herd and to ensure that we deal fairly
282 * with queued waiters, we use an exclusive wait. This means that when we
283 * get woken with enough journal space to get our reservation, we need to
284 * wake the next waiter on the list.
285 *
293 * Returns: errno 286 * Returns: errno
294 */ 287 */
295 288
296int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) 289int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
297{ 290{
298 unsigned int try = 0;
299 unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize); 291 unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize);
292 unsigned wanted = blks + reserved_blks;
293 DEFINE_WAIT(wait);
294 int did_wait = 0;
295 unsigned int free_blocks;
300 296
301 if (gfs2_assert_warn(sdp, blks) || 297 if (gfs2_assert_warn(sdp, blks) ||
302 gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks)) 298 gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks))
303 return -EINVAL; 299 return -EINVAL;
304 300retry:
305 mutex_lock(&sdp->sd_log_reserve_mutex); 301 free_blocks = atomic_read(&sdp->sd_log_blks_free);
306 gfs2_log_lock(sdp); 302 if (unlikely(free_blocks <= wanted)) {
307 while(atomic_read(&sdp->sd_log_blks_free) <= (blks + reserved_blks)) { 303 do {
308 gfs2_log_unlock(sdp); 304 prepare_to_wait_exclusive(&sdp->sd_log_waitq, &wait,
309 gfs2_ail1_empty(sdp, 0); 305 TASK_UNINTERRUPTIBLE);
310 gfs2_log_flush(sdp, NULL); 306 wake_up(&sdp->sd_logd_waitq);
311 307 did_wait = 1;
312 if (try++) 308 if (atomic_read(&sdp->sd_log_blks_free) <= wanted)
313 gfs2_ail1_start(sdp, 0); 309 io_schedule();
314 gfs2_log_lock(sdp); 310 free_blocks = atomic_read(&sdp->sd_log_blks_free);
311 } while(free_blocks <= wanted);
312 finish_wait(&sdp->sd_log_waitq, &wait);
315 } 313 }
316 atomic_sub(blks, &sdp->sd_log_blks_free); 314 if (atomic_cmpxchg(&sdp->sd_log_blks_free, free_blocks,
315 free_blocks - blks) != free_blocks)
316 goto retry;
317 trace_gfs2_log_blocks(sdp, -blks); 317 trace_gfs2_log_blocks(sdp, -blks);
318 gfs2_log_unlock(sdp); 318
319 mutex_unlock(&sdp->sd_log_reserve_mutex); 319 /*
320 * If we waited, then so might others, wake them up _after_ we get
321 * our share of the log.
322 */
323 if (unlikely(did_wait))
324 wake_up(&sdp->sd_log_waitq);
320 325
321 down_read(&sdp->sd_log_flush_lock); 326 down_read(&sdp->sd_log_flush_lock);
322 327
323 return 0; 328 return 0;
324} 329}
325 330
326/**
327 * gfs2_log_release - Release a given number of log blocks
328 * @sdp: The GFS2 superblock
329 * @blks: The number of blocks
330 *
331 */
332
333void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
334{
335
336 gfs2_log_lock(sdp);
337 atomic_add(blks, &sdp->sd_log_blks_free);
338 trace_gfs2_log_blocks(sdp, blks);
339 gfs2_assert_withdraw(sdp,
340 atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
341 gfs2_log_unlock(sdp);
342 up_read(&sdp->sd_log_flush_lock);
343}
344
345static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn) 331static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
346{ 332{
347 struct gfs2_journal_extent *je; 333 struct gfs2_journal_extent *je;
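
The rewritten gfs2_log_reserve() above combines two techniques: an exclusive wait on sd_log_waitq (only one waiter wakes per wake_up(), avoiding a thundering herd, and each satisfied waiter wakes the next), and a lock-free claim of the blocks via atomic_cmpxchg(), retried if another CPU changed sd_log_blks_free between the read and the update. The cmpxchg half in isolation, as a userspace C11 sketch (names and numbers illustrative):

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint free_blocks = 1000;

/* Carve blks out of free_blocks without a lock: re-read and retry
 * whenever another thread changed the counter under us. */
static int reserve(unsigned int blks, unsigned int reserved_floor)
{
	unsigned int old;

	do {
		old = atomic_load(&free_blocks);
		if (old <= blks + reserved_floor)
			return -1;	/* the caller would sleep on the waitqueue here */
	} while (!atomic_compare_exchange_weak(&free_blocks, &old, old - blks));

	return 0;
}

int main(void)
{
	if (reserve(10, 6) == 0)
		printf("reserved, %u left\n", atomic_load(&free_blocks));
	return 0;
}
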
@@ -417,7 +403,7 @@ static unsigned int calc_reserved(struct gfs2_sbd *sdp)
417 databufhdrs_needed = (sdp->sd_log_commited_databuf + 403 databufhdrs_needed = (sdp->sd_log_commited_databuf +
418 (dbuf_limit - 1)) / dbuf_limit; 404 (dbuf_limit - 1)) / dbuf_limit;
419 405
420 if (sdp->sd_log_commited_revoke) 406 if (sdp->sd_log_commited_revoke > 0)
421 revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke, 407 revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
422 sizeof(u64)); 408 sizeof(u64));
423 409
@@ -559,11 +545,10 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)
559 545
560 ail2_empty(sdp, new_tail); 546 ail2_empty(sdp, new_tail);
561 547
562 gfs2_log_lock(sdp);
563 atomic_add(dist, &sdp->sd_log_blks_free); 548 atomic_add(dist, &sdp->sd_log_blks_free);
564 trace_gfs2_log_blocks(sdp, dist); 549 trace_gfs2_log_blocks(sdp, dist);
565 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); 550 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
566 gfs2_log_unlock(sdp); 551 sdp->sd_jdesc->jd_blocks);
567 552
568 sdp->sd_log_tail = new_tail; 553 sdp->sd_log_tail = new_tail;
569} 554}
@@ -615,6 +600,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
615 if (buffer_eopnotsupp(bh)) { 600 if (buffer_eopnotsupp(bh)) {
616 clear_buffer_eopnotsupp(bh); 601 clear_buffer_eopnotsupp(bh);
617 set_buffer_uptodate(bh); 602 set_buffer_uptodate(bh);
603 fs_info(sdp, "barrier sync failed - disabling barriers\n");
618 set_bit(SDF_NOBARRIERS, &sdp->sd_flags); 604 set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
619 lock_buffer(bh); 605 lock_buffer(bh);
620skip_barrier: 606skip_barrier:
@@ -790,7 +776,6 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
790 gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) || 776 gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) ||
791 (((int)sdp->sd_log_commited_databuf) >= 0)); 777 (((int)sdp->sd_log_commited_databuf) >= 0));
792 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; 778 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
793 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
794 reserved = calc_reserved(sdp); 779 reserved = calc_reserved(sdp);
795 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved); 780 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved);
796 unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved; 781 unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved;
@@ -823,6 +808,13 @@ static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
823 * @sdp: the filesystem 808 * @sdp: the filesystem
824 * @tr: the transaction 809 * @tr: the transaction
825 * 810 *
811 * We wake up gfs2_logd if the number of pinned blocks exceeds thresh1
812 * or the total number of used blocks (pinned blocks plus AIL blocks)
813 * is greater than thresh2.
814 *
815 * At mount time thresh1 is set to 2/5 of the journal size and thresh2
816 * to 4/5 of it (the values set in init_journal()).
817 *
826 * Returns: errno 818 * Returns: errno
827 */ 819 */
828 820
@@ -833,10 +825,10 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
833 825
834 up_read(&sdp->sd_log_flush_lock); 826 up_read(&sdp->sd_log_flush_lock);
835 827
836 gfs2_log_lock(sdp); 828 if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) ||
837 if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks)) 829 ((sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free)) >
838 wake_up_process(sdp->sd_logd_process); 830 atomic_read(&sdp->sd_log_thresh2)))
839 gfs2_log_unlock(sdp); 831 wake_up(&sdp->sd_logd_waitq);
840} 832}
841 833
842/** 834/**
@@ -883,13 +875,23 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
883{ 875{
884 gfs2_log_flush(sdp, NULL); 876 gfs2_log_flush(sdp, NULL);
885 for (;;) { 877 for (;;) {
886 gfs2_ail1_start(sdp, DIO_ALL); 878 gfs2_ail1_start(sdp);
887 if (gfs2_ail1_empty(sdp, DIO_ALL)) 879 if (gfs2_ail1_empty(sdp, DIO_ALL))
888 break; 880 break;
889 msleep(10); 881 msleep(10);
890 } 882 }
891} 883}
892 884
885static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp)
886{
887 return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1));
888}
889
890static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp)
891{
892 unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free);
893 return used_blocks >= atomic_read(&sdp->sd_log_thresh2);
894}
893 895
894/** 896/**
895 * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks 897 * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
@@ -902,28 +904,43 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
902int gfs2_logd(void *data) 904int gfs2_logd(void *data)
903{ 905{
904 struct gfs2_sbd *sdp = data; 906 struct gfs2_sbd *sdp = data;
905 unsigned long t; 907 unsigned long t = 1;
906 int need_flush; 908 DEFINE_WAIT(wait);
909 unsigned preflush;
907 910
908 while (!kthread_should_stop()) { 911 while (!kthread_should_stop()) {
909 /* Advance the log tail */
910 912
911 t = sdp->sd_log_flush_time + 913 preflush = atomic_read(&sdp->sd_log_pinned);
912 gfs2_tune_get(sdp, gt_log_flush_secs) * HZ; 914 if (gfs2_jrnl_flush_reqd(sdp) || t == 0) {
915 gfs2_ail1_empty(sdp, DIO_ALL);
916 gfs2_log_flush(sdp, NULL);
917 gfs2_ail1_empty(sdp, DIO_ALL);
918 }
913 919
914 gfs2_ail1_empty(sdp, DIO_ALL); 920 if (gfs2_ail_flush_reqd(sdp)) {
915 gfs2_log_lock(sdp); 921 gfs2_ail1_start(sdp);
916 need_flush = sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks); 922 io_schedule();
917 gfs2_log_unlock(sdp); 923 gfs2_ail1_empty(sdp, 0);
918 if (need_flush || time_after_eq(jiffies, t)) {
919 gfs2_log_flush(sdp, NULL); 924 gfs2_log_flush(sdp, NULL);
920 sdp->sd_log_flush_time = jiffies; 925 gfs2_ail1_empty(sdp, DIO_ALL);
921 } 926 }
922 927
928 wake_up(&sdp->sd_log_waitq);
923 t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; 929 t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
924 if (freezing(current)) 930 if (freezing(current))
925 refrigerator(); 931 refrigerator();
926 schedule_timeout_interruptible(t); 932
933 do {
934 prepare_to_wait(&sdp->sd_logd_waitq, &wait,
935 TASK_UNINTERRUPTIBLE);
936 if (!gfs2_ail_flush_reqd(sdp) &&
937 !gfs2_jrnl_flush_reqd(sdp) &&
938 !kthread_should_stop())
939 t = schedule_timeout(t);
940 } while(t && !gfs2_ail_flush_reqd(sdp) &&
941 !gfs2_jrnl_flush_reqd(sdp) &&
942 !kthread_should_stop());
943 finish_wait(&sdp->sd_logd_waitq, &wait);
927 } 944 }
928 945
929 return 0; 946 return 0;
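
The new gfs2_logd() main loop above is the canonical kthread sleep pattern: prepare_to_wait() queues the task before the wake conditions are re-checked, so a wake_up() issued between the check and schedule_timeout() cannot be lost, and finish_wait() dequeues the task afterwards. A condensed kernel-style sketch of just that loop (flush_needed() is a stand-in for the gfs2_ail_flush_reqd()/gfs2_jrnl_flush_reqd() tests):

#include <linux/types.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(work_waitq);

static bool flush_needed(void)
{
	return false;	/* stand-in for the *_flush_reqd() tests above */
}

static int worker(void *data)
{
	DEFINE_WAIT(wait);
	unsigned long t;

	while (!kthread_should_stop()) {
		/* ... the flush work would run here when flush_needed() ... */

		t = HZ;
		do {
			prepare_to_wait(&work_waitq, &wait, TASK_UNINTERRUPTIBLE);
			/* re-check AFTER queueing, so a concurrent wake_up()
			 * cannot slip through unseen */
			if (!flush_needed() && !kthread_should_stop())
				t = schedule_timeout(t);
		} while (t && !flush_needed() && !kthread_should_stop());
		finish_wait(&work_waitq, &wait);
	}
	return 0;
}
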
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 7c64510ccfd2..eb570b4ad443 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -51,7 +51,6 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
51 unsigned int ssize); 51 unsigned int ssize);
52 52
53int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); 53int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
54void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
55void gfs2_log_incr_head(struct gfs2_sbd *sdp); 54void gfs2_log_incr_head(struct gfs2_sbd *sdp);
56 55
57struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp); 56struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index adc260fbea90..bf33f822058d 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -54,6 +54,7 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
54 if (bd->bd_ail) 54 if (bd->bd_ail)
55 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list); 55 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
56 get_bh(bh); 56 get_bh(bh);
57 atomic_inc(&sdp->sd_log_pinned);
57 trace_gfs2_pin(bd, 1); 58 trace_gfs2_pin(bd, 1);
58} 59}
59 60
@@ -94,6 +95,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
94 trace_gfs2_pin(bd, 0); 95 trace_gfs2_pin(bd, 0);
95 gfs2_log_unlock(sdp); 96 gfs2_log_unlock(sdp);
96 unlock_buffer(bh); 97 unlock_buffer(bh);
98 atomic_dec(&sdp->sd_log_pinned);
97} 99}
98 100
99 101
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index a88fadc704bb..fb2a5f93b7c3 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -94,7 +94,7 @@ static int __init init_gfs2_fs(void)
94 if (!gfs2_glock_cachep) 94 if (!gfs2_glock_cachep)
95 goto fail; 95 goto fail;
96 96
97 gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock (aspace)", 97 gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock(aspace)",
98 sizeof(struct gfs2_glock) + 98 sizeof(struct gfs2_glock) +
99 sizeof(struct address_space), 99 sizeof(struct address_space),
100 0, 0, gfs2_init_gl_aspace_once); 100 0, 0, gfs2_init_gl_aspace_once);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 0bb12c80937a..18176d0b75d7 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -34,7 +34,6 @@
34 34
35static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc) 35static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc)
36{ 36{
37 int err;
38 struct buffer_head *bh, *head; 37 struct buffer_head *bh, *head;
39 int nr_underway = 0; 38 int nr_underway = 0;
40 int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ? 39 int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ?
@@ -86,11 +85,10 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
86 } while (bh != head); 85 } while (bh != head);
87 unlock_page(page); 86 unlock_page(page);
88 87
89 err = 0;
90 if (nr_underway == 0) 88 if (nr_underway == 0)
91 end_page_writeback(page); 89 end_page_writeback(page);
92 90
93 return err; 91 return 0;
94} 92}
95 93
96const struct address_space_operations gfs2_meta_aops = { 94const struct address_space_operations gfs2_meta_aops = {
@@ -313,6 +311,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
313 struct gfs2_bufdata *bd = bh->b_private; 311 struct gfs2_bufdata *bd = bh->b_private;
314 312
315 if (test_clear_buffer_pinned(bh)) { 313 if (test_clear_buffer_pinned(bh)) {
314 atomic_dec(&sdp->sd_log_pinned);
316 list_del_init(&bd->bd_le.le_list); 315 list_del_init(&bd->bd_le.le_list);
317 if (meta) { 316 if (meta) {
318 gfs2_assert_warn(sdp, sdp->sd_log_num_buf); 317 gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index a054b526dc08..3593b3a7290e 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -57,8 +57,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
57{ 57{
58 spin_lock_init(&gt->gt_spin); 58 spin_lock_init(&gt->gt_spin);
59 59
60 gt->gt_incore_log_blocks = 1024;
61 gt->gt_logd_secs = 1;
62 gt->gt_quota_simul_sync = 64; 60 gt->gt_quota_simul_sync = 64;
63 gt->gt_quota_warn_period = 10; 61 gt->gt_quota_warn_period = 10;
64 gt->gt_quota_scale_num = 1; 62 gt->gt_quota_scale_num = 1;
@@ -101,14 +99,15 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
101 spin_lock_init(&sdp->sd_trunc_lock); 99 spin_lock_init(&sdp->sd_trunc_lock);
102 100
103 spin_lock_init(&sdp->sd_log_lock); 101 spin_lock_init(&sdp->sd_log_lock);
104 102 atomic_set(&sdp->sd_log_pinned, 0);
105 INIT_LIST_HEAD(&sdp->sd_log_le_buf); 103 INIT_LIST_HEAD(&sdp->sd_log_le_buf);
106 INIT_LIST_HEAD(&sdp->sd_log_le_revoke); 104 INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
107 INIT_LIST_HEAD(&sdp->sd_log_le_rg); 105 INIT_LIST_HEAD(&sdp->sd_log_le_rg);
108 INIT_LIST_HEAD(&sdp->sd_log_le_databuf); 106 INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
109 INIT_LIST_HEAD(&sdp->sd_log_le_ordered); 107 INIT_LIST_HEAD(&sdp->sd_log_le_ordered);
110 108
111 mutex_init(&sdp->sd_log_reserve_mutex); 109 init_waitqueue_head(&sdp->sd_log_waitq);
110 init_waitqueue_head(&sdp->sd_logd_waitq);
112 INIT_LIST_HEAD(&sdp->sd_ail1_list); 111 INIT_LIST_HEAD(&sdp->sd_ail1_list);
113 INIT_LIST_HEAD(&sdp->sd_ail2_list); 112 INIT_LIST_HEAD(&sdp->sd_ail2_list);
114 113
@@ -487,7 +486,7 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
487 struct dentry *dentry; 486 struct dentry *dentry;
488 struct inode *inode; 487 struct inode *inode;
489 488
490 inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0); 489 inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0);
491 if (IS_ERR(inode)) { 490 if (IS_ERR(inode)) {
492 fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode)); 491 fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));
493 return PTR_ERR(inode); 492 return PTR_ERR(inode);
@@ -733,6 +732,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
733 if (sdp->sd_args.ar_spectator) { 732 if (sdp->sd_args.ar_spectator) {
734 sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0); 733 sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
735 atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); 734 atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
735 atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5);
736 atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5);
736 } else { 737 } else {
737 if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) { 738 if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) {
738 fs_err(sdp, "can't mount journal #%u\n", 739 fs_err(sdp, "can't mount journal #%u\n",
@@ -770,6 +771,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
770 goto fail_jinode_gh; 771 goto fail_jinode_gh;
771 } 772 }
772 atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); 773 atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
774 atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5);
775 atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5);
773 776
774 /* Map the extents for this journal's blocks */ 777 /* Map the extents for this journal's blocks */
775 map_journal_extents(sdp); 778 map_journal_extents(sdp);
@@ -951,8 +954,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
951 if (undo) 954 if (undo)
952 goto fail_quotad; 955 goto fail_quotad;
953 956
954 sdp->sd_log_flush_time = jiffies;
955
956 p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); 957 p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
957 error = IS_ERR(p); 958 error = IS_ERR(p);
958 if (error) { 959 if (error) {
@@ -1001,7 +1002,7 @@ static const struct lm_lockops nolock_ops = {
1001/** 1002/**
1002 * gfs2_lm_mount - mount a locking protocol 1003 * gfs2_lm_mount - mount a locking protocol
1003 * @sdp: the filesystem 1004 * @sdp: the filesystem
1004 * @args: mount arguements 1005 * @args: mount arguments
1005 * @silent: if 1, don't complain if the FS isn't a GFS2 fs 1006 * @silent: if 1, don't complain if the FS isn't a GFS2 fs
1006 * 1007 *
1007 * Returns: errno 1008 * Returns: errno
@@ -1160,7 +1161,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1160 GFS2_BASIC_BLOCK_SHIFT; 1161 GFS2_BASIC_BLOCK_SHIFT;
1161 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; 1162 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
1162 1163
1163 sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit; 1164 sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit;
1164 sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum; 1165 sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum;
1165 if (sdp->sd_args.ar_statfs_quantum) { 1166 if (sdp->sd_args.ar_statfs_quantum) {
1166 sdp->sd_tune.gt_statfs_slow = 0; 1167 sdp->sd_tune.gt_statfs_slow = 0;
@@ -1323,7 +1324,7 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
1323 memset(&args, 0, sizeof(args)); 1324 memset(&args, 0, sizeof(args));
1324 args.ar_quota = GFS2_QUOTA_DEFAULT; 1325 args.ar_quota = GFS2_QUOTA_DEFAULT;
1325 args.ar_data = GFS2_DATA_DEFAULT; 1326 args.ar_data = GFS2_DATA_DEFAULT;
1326 args.ar_commit = 60; 1327 args.ar_commit = 30;
1327 args.ar_statfs_quantum = 30; 1328 args.ar_statfs_quantum = 30;
1328 args.ar_quota_quantum = 60; 1329 args.ar_quota_quantum = 60;
1329 args.ar_errors = GFS2_ERRORS_DEFAULT; 1330 args.ar_errors = GFS2_ERRORS_DEFAULT;
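Note: the two thresholds set in init_journal above are plain integer fractions of the journal size, 2/5 and 4/5 of jd_blocks. Judging from the logd wait queues added in init_sbd, they look like the points at which logd first wakes and at which a flush becomes urgent, though that reading comes from context rather than from this hunk. A quick sketch of the arithmetic with a made-up journal size:

    #include <stdio.h>

    int main(void)
    {
        unsigned int jd_blocks = 32768;            /* hypothetical journal size */
        unsigned int thresh1 = 2 * jd_blocks / 5;  /* 13107: first wakeup point */
        unsigned int thresh2 = 4 * jd_blocks / 5;  /* 26214: second, more urgent */
        printf("thresh1=%u thresh2=%u\n", thresh1, thresh2);
        return 0;
    }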
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 6dbcbad6ab17..d5f4661287f9 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -637,15 +637,40 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
637 unsigned blocksize, iblock, pos; 637 unsigned blocksize, iblock, pos;
638 struct buffer_head *bh, *dibh; 638 struct buffer_head *bh, *dibh;
639 struct page *page; 639 struct page *page;
640 void *kaddr; 640 void *kaddr, *ptr;
641 struct gfs2_quota *qp; 641 struct gfs2_quota q, *qp;
642 s64 value; 642 int err, nbytes;
643 int err = -EIO;
644 u64 size; 643 u64 size;
645 644
646 if (gfs2_is_stuffed(ip)) 645 if (gfs2_is_stuffed(ip))
647 gfs2_unstuff_dinode(ip, NULL); 646 gfs2_unstuff_dinode(ip, NULL);
648 647
648 memset(&q, 0, sizeof(struct gfs2_quota));
649 err = gfs2_internal_read(ip, NULL, (char *)&q, &loc, sizeof(q));
650 if (err < 0)
651 return err;
652
653 err = -EIO;
654 qp = &q;
655 qp->qu_value = be64_to_cpu(qp->qu_value);
656 qp->qu_value += change;
657 qp->qu_value = cpu_to_be64(qp->qu_value);
658 qd->qd_qb.qb_value = qp->qu_value;
659 if (fdq) {
660 if (fdq->d_fieldmask & FS_DQ_BSOFT) {
661 qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
662 qd->qd_qb.qb_warn = qp->qu_warn;
663 }
664 if (fdq->d_fieldmask & FS_DQ_BHARD) {
665 qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
666 qd->qd_qb.qb_limit = qp->qu_limit;
667 }
668 }
669
670 /* Write the quota into the quota file on disk */
671 ptr = qp;
672 nbytes = sizeof(struct gfs2_quota);
673get_a_page:
649 page = grab_cache_page(mapping, index); 674 page = grab_cache_page(mapping, index);
650 if (!page) 675 if (!page)
651 return -ENOMEM; 676 return -ENOMEM;
@@ -667,7 +692,12 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
667 if (!buffer_mapped(bh)) { 692 if (!buffer_mapped(bh)) {
668 gfs2_block_map(inode, iblock, bh, 1); 693 gfs2_block_map(inode, iblock, bh, 1);
669 if (!buffer_mapped(bh)) 694 if (!buffer_mapped(bh))
670 goto unlock; 695 goto unlock_out;
696 /* If it's a newly allocated disk block for quota, zero it */
697 if (buffer_new(bh)) {
698 memset(bh->b_data, 0, bh->b_size);
699 set_buffer_uptodate(bh);
700 }
671 } 701 }
672 702
673 if (PageUptodate(page)) 703 if (PageUptodate(page))
@@ -677,32 +707,34 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
677 ll_rw_block(READ_META, 1, &bh); 707 ll_rw_block(READ_META, 1, &bh);
678 wait_on_buffer(bh); 708 wait_on_buffer(bh);
679 if (!buffer_uptodate(bh)) 709 if (!buffer_uptodate(bh))
680 goto unlock; 710 goto unlock_out;
681 } 711 }
682 712
683 gfs2_trans_add_bh(ip->i_gl, bh, 0); 713 gfs2_trans_add_bh(ip->i_gl, bh, 0);
684 714
685 kaddr = kmap_atomic(page, KM_USER0); 715 kaddr = kmap_atomic(page, KM_USER0);
686 qp = kaddr + offset; 716 if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE)
687 value = (s64)be64_to_cpu(qp->qu_value) + change; 717 nbytes = PAGE_CACHE_SIZE - offset;
688 qp->qu_value = cpu_to_be64(value); 718 memcpy(kaddr + offset, ptr, nbytes);
689 qd->qd_qb.qb_value = qp->qu_value;
690 if (fdq) {
691 if (fdq->d_fieldmask & FS_DQ_BSOFT) {
692 qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
693 qd->qd_qb.qb_warn = qp->qu_warn;
694 }
695 if (fdq->d_fieldmask & FS_DQ_BHARD) {
696 qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
697 qd->qd_qb.qb_limit = qp->qu_limit;
698 }
699 }
700 flush_dcache_page(page); 719 flush_dcache_page(page);
701 kunmap_atomic(kaddr, KM_USER0); 720 kunmap_atomic(kaddr, KM_USER0);
721 unlock_page(page);
722 page_cache_release(page);
702 723
724 /* If quota straddles page boundary, we need to update the rest of the
725 * quota at the beginning of the next page */
726 if (offset != 0) { /* first page, offset is closer to PAGE_CACHE_SIZE */
727 ptr = ptr + nbytes;
728 nbytes = sizeof(struct gfs2_quota) - nbytes;
729 offset = 0;
730 index++;
731 goto get_a_page;
732 }
733
734 /* Update the disk inode timestamp and size (if extended) */
703 err = gfs2_meta_inode_buffer(ip, &dibh); 735 err = gfs2_meta_inode_buffer(ip, &dibh);
704 if (err) 736 if (err)
705 goto unlock; 737 goto out;
706 738
707 size = loc + sizeof(struct gfs2_quota); 739 size = loc + sizeof(struct gfs2_quota);
708 if (size > inode->i_size) { 740 if (size > inode->i_size) {
@@ -715,7 +747,9 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
715 brelse(dibh); 747 brelse(dibh);
716 mark_inode_dirty(inode); 748 mark_inode_dirty(inode);
717 749
718unlock: 750out:
751 return err;
752unlock_out:
719 unlock_page(page); 753 unlock_page(page);
720 page_cache_release(page); 754 page_cache_release(page);
721 return err; 755 return err;
@@ -779,8 +813,10 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
779 * rgrp since it won't be allocated during the transaction 813 * rgrp since it won't be allocated during the transaction
780 */ 814 */
781 al->al_requested = 1; 815 al->al_requested = 1;
782 /* +1 in the end for block requested above for unstuffing */ 816 /* +3 in the end for unstuffing block, inode size update block
783 blocks = num_qd * data_blocks + RES_DINODE + num_qd + 1; 817 * and another block in case quota straddles page boundary and
818 * two blocks need to be updated instead of 1 */
819 blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3;
784 820
785 if (nalloc) 821 if (nalloc)
786 al->al_requested += nalloc * (data_blocks + ind_blocks); 822 al->al_requested += nalloc * (data_blocks + ind_blocks);
@@ -1418,10 +1454,18 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
1418 1454
1419 memset(fqs, 0, sizeof(struct fs_quota_stat)); 1455 memset(fqs, 0, sizeof(struct fs_quota_stat));
1420 fqs->qs_version = FS_QSTAT_VERSION; 1456 fqs->qs_version = FS_QSTAT_VERSION;
1421 if (sdp->sd_args.ar_quota == GFS2_QUOTA_ON) 1457
1422 fqs->qs_flags = (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD); 1458 switch (sdp->sd_args.ar_quota) {
1423 else if (sdp->sd_args.ar_quota == GFS2_QUOTA_ACCOUNT) 1459 case GFS2_QUOTA_ON:
1424 fqs->qs_flags = (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT); 1460 fqs->qs_flags |= (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD);
1461 /*FALLTHRU*/
1462 case GFS2_QUOTA_ACCOUNT:
1463 fqs->qs_flags |= (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT);
1464 break;
1465 case GFS2_QUOTA_OFF:
1466 break;
1467 }
1468
1425 if (sdp->sd_quota_inode) { 1469 if (sdp->sd_quota_inode) {
1426 fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr; 1470 fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr;
1427 fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks; 1471 fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks;
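Note: the gfs2_adjust_quota rewrite above switches to a read-modify-write of the whole struct gfs2_quota: the record is read with gfs2_internal_read, adjusted in memory, then copied back page by page, with the get_a_page loop clamping each copy at PAGE_CACHE_SIZE so a record straddling a page boundary is written in two parts. A self-contained sketch of that clamped copy (the tiny page size and the paged_write helper are inventions for illustration):

    #include <stdio.h>
    #include <string.h>

    #define PAGE_SZ 16   /* tiny "page" so the boundary split is visible */

    /* Write nbytes of src into paged storage at (index, offset), clamping
     * each memcpy at the page edge exactly like the get_a_page loop. */
    static void paged_write(char pages[][PAGE_SZ], int index, int offset,
                            const char *src, size_t nbytes)
    {
        while (nbytes) {
            size_t chunk = nbytes;
            if ((size_t)offset + chunk > PAGE_SZ)
                chunk = PAGE_SZ - offset;   /* stop at the boundary ... */
            memcpy(&pages[index][offset], src, chunk);
            src += chunk;
            nbytes -= chunk;
            offset = 0;                     /* ... and resume on the next page */
            index++;
        }
    }

    int main(void)
    {
        char pages[2][PAGE_SZ] = { { 0 } };
        paged_write(pages, 0, 10, "0123456789", 10); /* straddles pages 0 and 1 */
        printf("%.6s|%.4s\n", &pages[0][10], pages[1]); /* 012345|6789 */
        return 0;
    }

This split is also why the do_sync hunk reserves "+3" blocks instead of "+1": the record may dirty two data blocks plus the inode-size update.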
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 503b842f3ba2..8bce73ed4d8e 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -948,13 +948,13 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
948 * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes 948 * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
949 * @rgd: The rgrp 949 * @rgd: The rgrp
950 * 950 *
951 * Returns: The inode, if one has been found 951 * Returns: 0 if no error
952 * The inode, if one has been found, in inode.
952 */ 953 */
953 954
954static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, 955static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
955 u64 skip) 956 u64 skip)
956{ 957{
957 struct inode *inode;
958 u32 goal = 0, block; 958 u32 goal = 0, block;
959 u64 no_addr; 959 u64 no_addr;
960 struct gfs2_sbd *sdp = rgd->rd_sbd; 960 struct gfs2_sbd *sdp = rgd->rd_sbd;
@@ -979,14 +979,11 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
979 if (no_addr == skip) 979 if (no_addr == skip)
980 continue; 980 continue;
981 *last_unlinked = no_addr; 981 *last_unlinked = no_addr;
982 inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN, 982 return no_addr;
983 no_addr, -1, 1);
984 if (!IS_ERR(inode))
985 return inode;
986 } 983 }
987 984
988 rgd->rd_flags &= ~GFS2_RDF_CHECK; 985 rgd->rd_flags &= ~GFS2_RDF_CHECK;
989 return NULL; 986 return 0;
990} 987}
991 988
992/** 989/**
@@ -1067,11 +1064,12 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
1067 * Try to acquire rgrp in way which avoids contending with others. 1064 * Try to acquire rgrp in way which avoids contending with others.
1068 * 1065 *
1069 * Returns: errno 1066 * Returns: errno
1067 * unlinked: the block address of an unlinked block to be reclaimed
1070 */ 1068 */
1071 1069
1072static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) 1070static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
1071 u64 *last_unlinked)
1073{ 1072{
1074 struct inode *inode = NULL;
1075 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1073 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1076 struct gfs2_rgrpd *rgd, *begin = NULL; 1074 struct gfs2_rgrpd *rgd, *begin = NULL;
1077 struct gfs2_alloc *al = ip->i_alloc; 1075 struct gfs2_alloc *al = ip->i_alloc;
@@ -1080,6 +1078,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1080 int loops = 0; 1078 int loops = 0;
1081 int error, rg_locked; 1079 int error, rg_locked;
1082 1080
1081 *unlinked = 0;
1083 rgd = gfs2_blk2rgrpd(sdp, ip->i_goal); 1082 rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
1084 1083
1085 while (rgd) { 1084 while (rgd) {
@@ -1096,19 +1095,24 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1096 case 0: 1095 case 0:
1097 if (try_rgrp_fit(rgd, al)) 1096 if (try_rgrp_fit(rgd, al))
1098 goto out; 1097 goto out;
1099 if (rgd->rd_flags & GFS2_RDF_CHECK) 1098 /* If the rg came in already locked, there's no
1100 inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); 1099 way we can recover from a failed try_rgrp_unlink
1100 because that would require an iput which can only
1101 happen after the rgrp is unlocked. */
1102 if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
1103 *unlinked = try_rgrp_unlink(rgd, last_unlinked,
1104 ip->i_no_addr);
1101 if (!rg_locked) 1105 if (!rg_locked)
1102 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1106 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1103 if (inode) 1107 if (*unlinked)
1104 return inode; 1108 return -EAGAIN;
1105 /* fall through */ 1109 /* fall through */
1106 case GLR_TRYFAILED: 1110 case GLR_TRYFAILED:
1107 rgd = recent_rgrp_next(rgd); 1111 rgd = recent_rgrp_next(rgd);
1108 break; 1112 break;
1109 1113
1110 default: 1114 default:
1111 return ERR_PTR(error); 1115 return error;
1112 } 1116 }
1113 } 1117 }
1114 1118
@@ -1130,12 +1134,13 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1130 case 0: 1134 case 0:
1131 if (try_rgrp_fit(rgd, al)) 1135 if (try_rgrp_fit(rgd, al))
1132 goto out; 1136 goto out;
1133 if (rgd->rd_flags & GFS2_RDF_CHECK) 1137 if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
1134 inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); 1138 *unlinked = try_rgrp_unlink(rgd, last_unlinked,
1139 ip->i_no_addr);
1135 if (!rg_locked) 1140 if (!rg_locked)
1136 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1141 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1137 if (inode) 1142 if (*unlinked)
1138 return inode; 1143 return -EAGAIN;
1139 break; 1144 break;
1140 1145
1141 case GLR_TRYFAILED: 1146 case GLR_TRYFAILED:
@@ -1143,7 +1148,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1143 break; 1148 break;
1144 1149
1145 default: 1150 default:
1146 return ERR_PTR(error); 1151 return error;
1147 } 1152 }
1148 1153
1149 rgd = gfs2_rgrpd_get_next(rgd); 1154 rgd = gfs2_rgrpd_get_next(rgd);
@@ -1152,7 +1157,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1152 1157
1153 if (rgd == begin) { 1158 if (rgd == begin) {
1154 if (++loops >= 3) 1159 if (++loops >= 3)
1155 return ERR_PTR(-ENOSPC); 1160 return -ENOSPC;
1156 if (!skipped) 1161 if (!skipped)
1157 loops++; 1162 loops++;
1158 flags = 0; 1163 flags = 0;
@@ -1172,7 +1177,7 @@ out:
1172 forward_rgrp_set(sdp, rgd); 1177 forward_rgrp_set(sdp, rgd);
1173 } 1178 }
1174 1179
1175 return NULL; 1180 return 0;
1176} 1181}
1177 1182
1178/** 1183/**
@@ -1188,7 +1193,7 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
1188 struct gfs2_alloc *al = ip->i_alloc; 1193 struct gfs2_alloc *al = ip->i_alloc;
1189 struct inode *inode; 1194 struct inode *inode;
1190 int error = 0; 1195 int error = 0;
1191 u64 last_unlinked = NO_BLOCK; 1196 u64 last_unlinked = NO_BLOCK, unlinked;
1192 1197
1193 if (gfs2_assert_warn(sdp, al->al_requested)) 1198 if (gfs2_assert_warn(sdp, al->al_requested))
1194 return -EINVAL; 1199 return -EINVAL;
@@ -1204,14 +1209,19 @@ try_again:
1204 if (error) 1209 if (error)
1205 return error; 1210 return error;
1206 1211
1207 inode = get_local_rgrp(ip, &last_unlinked); 1212 error = get_local_rgrp(ip, &unlinked, &last_unlinked);
1208 if (inode) { 1213 if (error) {
1209 if (ip != GFS2_I(sdp->sd_rindex)) 1214 if (ip != GFS2_I(sdp->sd_rindex))
1210 gfs2_glock_dq_uninit(&al->al_ri_gh); 1215 gfs2_glock_dq_uninit(&al->al_ri_gh);
1211 if (IS_ERR(inode)) 1216 if (error != -EAGAIN)
1212 return PTR_ERR(inode); 1217 return error;
1213 iput(inode); 1218 error = gfs2_unlinked_inode_lookup(ip->i_inode.i_sb,
1219 unlinked, &inode);
1220 if (inode)
1221 iput(inode);
1214 gfs2_log_flush(sdp, NULL); 1222 gfs2_log_flush(sdp, NULL);
1223 if (error == GLR_TRYFAILED)
1224 error = 0;
1215 goto try_again; 1225 goto try_again;
1216 } 1226 }
1217 1227
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index b4106ddaaa98..f07119d89557 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -10,6 +10,8 @@
10#ifndef __RGRP_DOT_H__ 10#ifndef __RGRP_DOT_H__
11#define __RGRP_DOT_H__ 11#define __RGRP_DOT_H__
12 12
13#include <linux/slab.h>
14
13struct gfs2_rgrpd; 15struct gfs2_rgrpd;
14struct gfs2_sbd; 16struct gfs2_sbd;
15struct gfs2_holder; 17struct gfs2_holder;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 50aac606b990..4d1aad38f1b1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1113,7 +1113,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1113 int error; 1113 int error;
1114 1114
1115 spin_lock(&gt->gt_spin); 1115 spin_lock(&gt->gt_spin);
1116 args.ar_commit = gt->gt_log_flush_secs; 1116 args.ar_commit = gt->gt_logd_secs;
1117 args.ar_quota_quantum = gt->gt_quota_quantum; 1117 args.ar_quota_quantum = gt->gt_quota_quantum;
1118 if (gt->gt_statfs_slow) 1118 if (gt->gt_statfs_slow)
1119 args.ar_statfs_quantum = 0; 1119 args.ar_statfs_quantum = 0;
@@ -1160,7 +1160,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1160 else 1160 else
1161 clear_bit(SDF_NOBARRIERS, &sdp->sd_flags); 1161 clear_bit(SDF_NOBARRIERS, &sdp->sd_flags);
1162 spin_lock(&gt->gt_spin); 1162 spin_lock(&gt->gt_spin);
1163 gt->gt_log_flush_secs = args.ar_commit; 1163 gt->gt_logd_secs = args.ar_commit;
1164 gt->gt_quota_quantum = args.ar_quota_quantum; 1164 gt->gt_quota_quantum = args.ar_quota_quantum;
1165 if (args.ar_statfs_quantum) { 1165 if (args.ar_statfs_quantum) {
1166 gt->gt_statfs_slow = 0; 1166 gt->gt_statfs_slow = 0;
@@ -1305,8 +1305,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1305 } 1305 }
1306 if (args->ar_discard) 1306 if (args->ar_discard)
1307 seq_printf(s, ",discard"); 1307 seq_printf(s, ",discard");
1308 val = sdp->sd_tune.gt_log_flush_secs; 1308 val = sdp->sd_tune.gt_logd_secs;
1309 if (val != 60) 1309 if (val != 30)
1310 seq_printf(s, ",commit=%d", val); 1310 seq_printf(s, ",commit=%d", val);
1311 val = sdp->sd_tune.gt_statfs_quantum; 1311 val = sdp->sd_tune.gt_statfs_quantum;
1312 if (val != 30) 1312 if (val != 30)
@@ -1334,7 +1334,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1334 } 1334 }
1335 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) 1335 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
1336 seq_printf(s, ",nobarrier"); 1336 seq_printf(s, ",nobarrier");
1337 1337 if (test_bit(SDF_DEMOTE, &sdp->sd_flags))
1338 seq_printf(s, ",demote_interface_used");
1338 return 0; 1339 return 0;
1339} 1340}
1340 1341
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index b5f1a46133c8..37f5393e68e6 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -8,7 +8,6 @@
8 */ 8 */
9 9
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h> 11#include <linux/spinlock.h>
13#include <linux/completion.h> 12#include <linux/completion.h>
14#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
@@ -49,7 +48,7 @@ static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
49 return a->store ? a->store(sdp, buf, len) : len; 48 return a->store ? a->store(sdp, buf, len) : len;
50} 49}
51 50
52static struct sysfs_ops gfs2_attr_ops = { 51static const struct sysfs_ops gfs2_attr_ops = {
53 .show = gfs2_attr_show, 52 .show = gfs2_attr_show,
54 .store = gfs2_attr_store, 53 .store = gfs2_attr_store,
55}; 54};
@@ -233,6 +232,8 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
233 glops = gfs2_glops_list[gltype]; 232 glops = gfs2_glops_list[gltype];
234 if (glops == NULL) 233 if (glops == NULL)
235 return -EINVAL; 234 return -EINVAL;
235 if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags))
236 fs_info(sdp, "demote interface used\n");
236 rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl); 237 rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl);
237 if (rv) 238 if (rv)
238 return rv; 239 return rv;
@@ -469,8 +470,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
469} \ 470} \
470TUNE_ATTR_2(name, name##_store) 471TUNE_ATTR_2(name, name##_store)
471 472
472TUNE_ATTR(incore_log_blocks, 0);
473TUNE_ATTR(log_flush_secs, 0);
474TUNE_ATTR(quota_warn_period, 0); 473TUNE_ATTR(quota_warn_period, 0);
475TUNE_ATTR(quota_quantum, 0); 474TUNE_ATTR(quota_quantum, 0);
476TUNE_ATTR(max_readahead, 0); 475TUNE_ATTR(max_readahead, 0);
@@ -482,8 +481,6 @@ TUNE_ATTR(statfs_quantum, 1);
482TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); 481TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
483 482
484static struct attribute *tune_attrs[] = { 483static struct attribute *tune_attrs[] = {
485 &tune_attr_incore_log_blocks.attr,
486 &tune_attr_log_flush_secs.attr,
487 &tune_attr_quota_warn_period.attr, 484 &tune_attr_quota_warn_period.attr,
488 &tune_attr_quota_quantum.attr, 485 &tune_attr_quota_quantum.attr,
489 &tune_attr_max_readahead.attr, 486 &tune_attr_max_readahead.attr,
@@ -574,7 +571,7 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
574 return 0; 571 return 0;
575} 572}
576 573
577static struct kset_uevent_ops gfs2_uevent_ops = { 574static const struct kset_uevent_ops gfs2_uevent_ops = {
578 .uevent = gfs2_uevent, 575 .uevent = gfs2_uevent,
579}; 576};
580 577
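Note: demote_rq_store now latches SDF_DEMOTE with test_and_set_bit, so the "demote interface used" message is emitted exactly once per mount (and, per the super.c hunk above, shows up in show_options afterwards). A sketch of the same latch; the kernel version is atomic, while this single-threaded stand-in is not:

    #include <stdbool.h>
    #include <stdio.h>

    static bool demote_used;   /* stands in for the SDF_DEMOTE flag bit */

    static void demote_rq(void)
    {
        if (!demote_used) {    /* kernel: !test_and_set_bit(SDF_DEMOTE, ...) */
            demote_used = true;
            printf("demote interface used\n");   /* printed only once */
        }
        /* ... issue the demote request itself ... */
    }

    int main(void)
    {
        demote_rq();
        demote_rq();   /* silent the second time */
        return 0;
    }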
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 4ef0e9fa3549..9ec73a854111 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -23,6 +23,7 @@
23#include "meta_io.h" 23#include "meta_io.h"
24#include "trans.h" 24#include "trans.h"
25#include "util.h" 25#include "util.h"
26#include "trace_gfs2.h"
26 27
27int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, 28int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
28 unsigned int revokes) 29 unsigned int revokes)
@@ -75,6 +76,23 @@ fail_holder_uninit:
75 return error; 76 return error;
76} 77}
77 78
79/**
80 * gfs2_log_release - Release a given number of log blocks
81 * @sdp: The GFS2 superblock
82 * @blks: The number of blocks
83 *
84 */
85
86static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
87{
88
89 atomic_add(blks, &sdp->sd_log_blks_free);
90 trace_gfs2_log_blocks(sdp, blks);
91 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
92 sdp->sd_jdesc->jd_blocks);
93 up_read(&sdp->sd_log_flush_lock);
94}
95
78void gfs2_trans_end(struct gfs2_sbd *sdp) 96void gfs2_trans_end(struct gfs2_sbd *sdp)
79{ 97{
80 struct gfs2_trans *tr = current->journal_info; 98 struct gfs2_trans *tr = current->journal_info;
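Note: gfs2_log_release, added here as a static helper, gives reserved journal blocks back to the free pool, asserts that the free count never exceeds the journal size, and drops the read side of sd_log_flush_lock (presumably taken when the blocks were reserved). A userspace sketch of just the counter-plus-invariant part; jd_blocks is made up and the lock handling is omitted:

    #include <assert.h>
    #include <stdio.h>

    static const unsigned int jd_blocks = 8192;   /* hypothetical journal size */
    static unsigned int log_blks_free;

    static void log_release(unsigned int blks)
    {
        log_blks_free += blks;
        assert(log_blks_free <= jd_blocks);  /* mirrors gfs2_assert_withdraw() */
    }

    int main(void)
    {
        log_blks_free = jd_blocks - 10;   /* pretend 10 blocks were reserved */
        log_release(10);                  /* fine: journal exactly "empty" again */
        printf("free=%u of %u\n", log_blks_free, jd_blocks);
        return 0;
    }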
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 226f2bfbf16a..53511291fe36 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index 0d200068d0af..cdb41a1f6a64 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/slab.h>
12#include <linux/swap.h> 13#include <linux/swap.h>
13 14
14#include "btree.h" 15#include "btree.h"
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 052f214ea6f0..38a0a9917d7f 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/slab.h>
12#include <linux/log2.h> 13#include <linux/log2.h>
13 14
14#include "btree.h" 15#include "btree.h"
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 8bbe03c3f6d5..86428f5ac991 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -11,6 +11,7 @@
11#include <linux/cdrom.h> 11#include <linux/cdrom.h>
12#include <linux/genhd.h> 12#include <linux/genhd.h>
13#include <linux/nls.h> 13#include <linux/nls.h>
14#include <linux/slab.h>
14 15
15#include "hfs_fs.h" 16#include "hfs_fs.h"
16#include "btree.h" 17#include "btree.h"
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 5ed7252b7b23..0a81eb7111f3 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -19,6 +19,7 @@
19#include <linux/nls.h> 19#include <linux/nls.h>
20#include <linux/parser.h> 20#include <linux/parser.h>
21#include <linux/seq_file.h> 21#include <linux/seq_file.h>
22#include <linux/slab.h>
22#include <linux/smp_lock.h> 23#include <linux/smp_lock.h>
23#include <linux/vfs.h> 24#include <linux/vfs.h>
24 25
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 3fcbb0e1f6fc..572628b4b07d 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -15,6 +15,7 @@
15#include <linux/nls.h> 15#include <linux/nls.h>
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/seq_file.h> 17#include <linux/seq_file.h>
18#include <linux/slab.h>
18#include "hfsplus_fs.h" 19#include "hfsplus_fs.h"
19 20
20enum { 21enum {
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 032604e5ef2c..3a029d8f4cf1 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -11,6 +11,7 @@
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/statfs.h> 13#include <linux/statfs.h>
14#include <linux/slab.h>
14#include <linux/seq_file.h> 15#include <linux/seq_file.h>
15#include <linux/mount.h> 16#include <linux/mount.h>
16#include "hostfs.h" 17#include "hostfs.h"
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index b6fca543544c..eac5f96323e3 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -6,6 +6,7 @@
6 * general buffer i/o 6 * general buffer i/o
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/slab.h>
9#include "hpfs_fn.h" 10#include "hpfs_fn.h"
10 11
11void hpfs_lock_creation(struct super_block *s) 12void hpfs_lock_creation(struct super_block *s)
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 26e3964a4b8c..2338130cceba 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/smp_lock.h> 9#include <linux/smp_lock.h>
10#include <linux/slab.h>
10#include "hpfs_fn.h" 11#include "hpfs_fn.h"
11 12
12static int hpfs_dir_release(struct inode *inode, struct file *filp) 13static int hpfs_dir_release(struct inode *inode, struct file *filp)
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index ff90affb94e1..1042a9bc97f3 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/smp_lock.h> 9#include <linux/smp_lock.h>
10#include <linux/slab.h>
10#include "hpfs_fn.h" 11#include "hpfs_fn.h"
11 12
12void hpfs_init_inode(struct inode *i) 13void hpfs_init_inode(struct inode *i)
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index cadc4ce48656..aa53842c599c 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -15,6 +15,7 @@
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/smp_lock.h> 16#include <linux/smp_lock.h>
17#include <linux/bitmap.h> 17#include <linux/bitmap.h>
18#include <linux/slab.h>
18 19
19/* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */ 20/* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */
20 21
diff --git a/fs/inode.c b/fs/inode.c
index 407bf392e20a..258ec22bb298 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1205,8 +1205,6 @@ void generic_delete_inode(struct inode *inode)
1205 inodes_stat.nr_inodes--; 1205 inodes_stat.nr_inodes--;
1206 spin_unlock(&inode_lock); 1206 spin_unlock(&inode_lock);
1207 1207
1208 security_inode_delete(inode);
1209
1210 if (op->delete_inode) { 1208 if (op->delete_inode) {
1211 void (*delete)(struct inode *) = op->delete_inode; 1209 void (*delete)(struct inode *) = op->delete_inode;
1212 /* Filesystems implementing their own 1210 /* Filesystems implementing their own
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 6c751106c2e5..7faefb4da939 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -228,14 +228,23 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
228 228
229#ifdef CONFIG_BLOCK 229#ifdef CONFIG_BLOCK
230 230
231#define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits) 231static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
232#define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits); 232{
233 return (offset >> inode->i_blkbits);
234}
235
236static inline loff_t blk_to_logical(struct inode *inode, sector_t blk)
237{
238 return (blk << inode->i_blkbits);
239}
233 240
234/** 241/**
235 * __generic_block_fiemap - FIEMAP for block based inodes (no locking) 242 * __generic_block_fiemap - FIEMAP for block based inodes (no locking)
236 * @inode - the inode to map 243 * @inode: the inode to map
237 * @arg - the pointer to userspace where we copy everything to 244 * @fieinfo: the fiemap info struct that will be passed back to userspace
238 * @get_block - the fs's get_block function 245 * @start: where to start mapping in the inode
246 * @len: how much space to map
247 * @get_block: the fs's get_block function
239 * 248 *
240 * This does FIEMAP for block based inodes. Basically it will just loop 249 * This does FIEMAP for block based inodes. Basically it will just loop
241 * through get_block until we hit the number of extents we want to map, or we 250 * through get_block until we hit the number of extents we want to map, or we
@@ -250,58 +259,63 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
250 */ 259 */
251 260
252int __generic_block_fiemap(struct inode *inode, 261int __generic_block_fiemap(struct inode *inode,
253 struct fiemap_extent_info *fieinfo, u64 start, 262 struct fiemap_extent_info *fieinfo, loff_t start,
254 u64 len, get_block_t *get_block) 263 loff_t len, get_block_t *get_block)
255{ 264{
256 struct buffer_head tmp; 265 struct buffer_head map_bh;
257 unsigned long long start_blk; 266 sector_t start_blk, last_blk;
258 long long length = 0, map_len = 0; 267 loff_t isize = i_size_read(inode);
259 u64 logical = 0, phys = 0, size = 0; 268 u64 logical = 0, phys = 0, size = 0;
260 u32 flags = FIEMAP_EXTENT_MERGED; 269 u32 flags = FIEMAP_EXTENT_MERGED;
261 int ret = 0, past_eof = 0, whole_file = 0; 270 bool past_eof = false, whole_file = false;
271 int ret = 0;
262 272
263 if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC))) 273 ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
274 if (ret)
264 return ret; 275 return ret;
265 276
266 start_blk = logical_to_blk(inode, start); 277 /*
267 278 * Either the i_mutex or other appropriate locking needs to be held
268 length = (long long)min_t(u64, len, i_size_read(inode)); 279 * since we expect isize to not change at all through the duration of
269 if (length < len) 280 * this call.
270 whole_file = 1; 281 */
282 if (len >= isize) {
283 whole_file = true;
284 len = isize;
285 }
271 286
272 map_len = length; 287 start_blk = logical_to_blk(inode, start);
288 last_blk = logical_to_blk(inode, start + len - 1);
273 289
274 do { 290 do {
275 /* 291 /*
276 * we set b_size to the total size we want so it will map as 292 * we set b_size to the total size we want so it will map as
277 * many contiguous blocks as possible at once 293 * many contiguous blocks as possible at once
278 */ 294 */
279 memset(&tmp, 0, sizeof(struct buffer_head)); 295 memset(&map_bh, 0, sizeof(struct buffer_head));
280 tmp.b_size = map_len; 296 map_bh.b_size = len;
281 297
282 ret = get_block(inode, start_blk, &tmp, 0); 298 ret = get_block(inode, start_blk, &map_bh, 0);
283 if (ret) 299 if (ret)
284 break; 300 break;
285 301
286 /* HOLE */ 302 /* HOLE */
287 if (!buffer_mapped(&tmp)) { 303 if (!buffer_mapped(&map_bh)) {
288 length -= blk_to_logical(inode, 1);
289 start_blk++; 304 start_blk++;
290 305
291 /* 306 /*
292 * we want to handle the case where there is an 307 * We want to handle the case where there is an
293 * allocated block at the front of the file, and then 308 * allocated block at the front of the file, and then
294 * nothing but holes up to the end of the file properly, 309 * nothing but holes up to the end of the file properly,
295 * to make sure that extent at the front gets properly 310 * to make sure that extent at the front gets properly
296 * marked with FIEMAP_EXTENT_LAST 311 * marked with FIEMAP_EXTENT_LAST
297 */ 312 */
298 if (!past_eof && 313 if (!past_eof &&
299 blk_to_logical(inode, start_blk) >= 314 blk_to_logical(inode, start_blk) >= isize)
300 blk_to_logical(inode, 0)+i_size_read(inode))
301 past_eof = 1; 315 past_eof = 1;
302 316
303 /* 317 /*
304 * first hole after going past the EOF, this is our 318 * First hole after going past the EOF, this is our
305 * last extent 319 * last extent
306 */ 320 */
307 if (past_eof && size) { 321 if (past_eof && size) {
@@ -309,15 +323,18 @@ int __generic_block_fiemap(struct inode *inode,
309 ret = fiemap_fill_next_extent(fieinfo, logical, 323 ret = fiemap_fill_next_extent(fieinfo, logical,
310 phys, size, 324 phys, size,
311 flags); 325 flags);
312 break; 326 } else if (size) {
327 ret = fiemap_fill_next_extent(fieinfo, logical,
328 phys, size, flags);
329 size = 0;
313 } 330 }
314 331
315 /* if we have holes up to/past EOF then we're done */ 332 /* if we have holes up to/past EOF then we're done */
316 if (length <= 0 || past_eof) 333 if (start_blk > last_blk || past_eof || ret)
317 break; 334 break;
318 } else { 335 } else {
319 /* 336 /*
320 * we have gone over the length of what we wanted to 337 * We have gone over the length of what we wanted to
321 * map, and it wasn't the entire file, so add the extent 338 * map, and it wasn't the entire file, so add the extent
322 * we got last time and exit. 339 * we got last time and exit.
323 * 340 *
@@ -331,7 +348,7 @@ int __generic_block_fiemap(struct inode *inode,
331 * are good to go, just add the extent to the fieinfo 348 * are good to go, just add the extent to the fieinfo
332 * and break 349 * and break
333 */ 350 */
334 if (length <= 0 && !whole_file) { 351 if (start_blk > last_blk && !whole_file) {
335 ret = fiemap_fill_next_extent(fieinfo, logical, 352 ret = fiemap_fill_next_extent(fieinfo, logical,
336 phys, size, 353 phys, size,
337 flags); 354 flags);
@@ -351,11 +368,10 @@ int __generic_block_fiemap(struct inode *inode,
351 } 368 }
352 369
353 logical = blk_to_logical(inode, start_blk); 370 logical = blk_to_logical(inode, start_blk);
354 phys = blk_to_logical(inode, tmp.b_blocknr); 371 phys = blk_to_logical(inode, map_bh.b_blocknr);
355 size = tmp.b_size; 372 size = map_bh.b_size;
356 flags = FIEMAP_EXTENT_MERGED; 373 flags = FIEMAP_EXTENT_MERGED;
357 374
358 length -= tmp.b_size;
359 start_blk += logical_to_blk(inode, size); 375 start_blk += logical_to_blk(inode, size);
360 376
361 /* 377 /*
@@ -363,15 +379,13 @@ int __generic_block_fiemap(struct inode *inode,
363 * soon as we find a hole that the last extent we found 379 * soon as we find a hole that the last extent we found
364 * is marked with FIEMAP_EXTENT_LAST 380 * is marked with FIEMAP_EXTENT_LAST
365 */ 381 */
366 if (!past_eof && 382 if (!past_eof && logical + size >= isize)
367 logical+size >= 383 past_eof = true;
368 blk_to_logical(inode, 0)+i_size_read(inode))
369 past_eof = 1;
370 } 384 }
371 cond_resched(); 385 cond_resched();
372 } while (1); 386 } while (1);
373 387
374 /* if ret is 1 then we just hit the end of the extent array */ 388 /* If ret is 1 then we just hit the end of the extent array */
375 if (ret == 1) 389 if (ret == 1)
376 ret = 0; 390 ret = 0;
377 391
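Note: two things are worth calling out in the ioctl.c rewrite. First, the old macros become typed static inline helpers; the removed logical_to_blk macro even carried a stray trailing semicolon, which made it hazardous in expressions. Second, the loop now bounds itself with last_blk instead of decrementing a signed length. A sketch of the helper conversion with the block math applied (blkbits is passed directly instead of via struct inode, purely for illustration):

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t sector_t;
    typedef int64_t  loff_t;

    /* Typed replacements for the old text-substitution macros. */
    static inline sector_t logical_to_blk(loff_t offset, unsigned int blkbits)
    {
        return offset >> blkbits;
    }

    static inline loff_t blk_to_logical(sector_t blk, unsigned int blkbits)
    {
        return (loff_t)blk << blkbits;
    }

    int main(void)
    {
        unsigned int blkbits = 12;                    /* 4 KiB blocks */
        loff_t start = 5000, len = 10000;
        sector_t first = logical_to_blk(start, blkbits);
        sector_t last  = logical_to_blk(start + len - 1, blkbits);
        printf("blocks %llu..%llu\n",
               (unsigned long long)first, (unsigned long long)last); /* 1..3 */
        return 0;
    }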
diff --git a/fs/ioprio.c b/fs/ioprio.c
index c7c0b28d7d21..748cfb92dcc6 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -19,6 +19,7 @@
19 * See also Documentation/block/ioprio.txt 19 * See also Documentation/block/ioprio.txt
20 * 20 *
21 */ 21 */
22#include <linux/gfp.h>
22#include <linux/kernel.h> 23#include <linux/kernel.h>
23#include <linux/ioprio.h> 24#include <linux/ioprio.h>
24#include <linux/blkdev.h> 25#include <linux/blkdev.h>
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index 8ba5441063be..b9ab69b3a482 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -11,6 +11,7 @@
11 * isofs directory handling functions 11 * isofs directory handling functions
12 */ 12 */
13#include <linux/smp_lock.h> 13#include <linux/smp_lock.h>
14#include <linux/gfp.h>
14#include "isofs.h" 15#include "isofs.h"
15 16
16int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode) 17int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode)
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index eaa831311c9c..ab438beb867c 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/smp_lock.h> 9#include <linux/smp_lock.h>
10#include <linux/gfp.h>
10#include "isofs.h" 11#include "isofs.h"
11 12
12/* 13/*
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 2c90e3ef625f..ecb44c94ba8d 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -17,7 +17,6 @@
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/jbd.h> 18#include <linux/jbd.h>
19#include <linux/errno.h> 19#include <linux/errno.h>
20#include <linux/slab.h>
21#include <linux/mm.h> 20#include <linux/mm.h>
22#include <linux/pagemap.h> 21#include <linux/pagemap.h>
23#include <linux/bio.h> 22#include <linux/bio.h>
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index cb1a49ae605e..54c9bc9e1b17 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -20,7 +20,6 @@
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/jbd.h> 21#include <linux/jbd.h>
22#include <linux/errno.h> 22#include <linux/errno.h>
23#include <linux/slab.h>
24#endif 23#endif
25 24
26/* 25/*
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 99e9fea11077..5ae71e75a491 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1398,7 +1398,7 @@ int journal_stop(handle_t *handle)
1398 * the case where our storage is so fast that it is more optimal to go 1398 * the case where our storage is so fast that it is more optimal to go
1399 * ahead and force a flush and wait for the transaction to be committed 1399 * ahead and force a flush and wait for the transaction to be committed
1400 * than it is to wait for an arbitrary amount of time for new writers to 1400 * than it is to wait for an arbitrary amount of time for new writers to
1401 * join the transaction. We acheive this by measuring how long it takes 1401 * join the transaction. We achieve this by measuring how long it takes
1402 * to commit a transaction, and compare it with how long this 1402 * to commit a transaction, and compare it with how long this
1403 * transaction has been running, and if run time < commit time then we 1403 * transaction has been running, and if run time < commit time then we
1404 * sleep for the delta and commit. This greatly helps super fast disks 1404 * sleep for the delta and commit. This greatly helps super fast disks
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index c03d4dce4d76..bc2ff5932769 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1889,7 +1889,7 @@ static struct kmem_cache *get_slab(size_t size)
1889 BUG_ON(i >= JBD2_MAX_SLABS); 1889 BUG_ON(i >= JBD2_MAX_SLABS);
1890 if (unlikely(i < 0)) 1890 if (unlikely(i < 0))
1891 i = 0; 1891 i = 0;
1892 BUG_ON(jbd2_slab[i] == 0); 1892 BUG_ON(jbd2_slab[i] == NULL);
1893 return jbd2_slab[i]; 1893 return jbd2_slab[i];
1894} 1894}
1895 1895
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 73063285b13f..049281b7cb89 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -20,7 +20,6 @@
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/jbd2.h> 21#include <linux/jbd2.h>
22#include <linux/errno.h> 22#include <linux/errno.h>
23#include <linux/slab.h>
24#include <linux/crc32.h> 23#include <linux/crc32.h>
25#endif 24#endif
26 25
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 3ff50da94789..55f1dde2fa8b 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -23,10 +23,9 @@ static int jffs2_garbage_collect_thread(void *);
23 23
24void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c) 24void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c)
25{ 25{
26 spin_lock(&c->erase_completion_lock); 26 assert_spin_locked(&c->erase_completion_lock);
27 if (c->gc_task && jffs2_thread_should_wake(c)) 27 if (c->gc_task && jffs2_thread_should_wake(c))
28 send_sig(SIGHUP, c->gc_task, 1); 28 send_sig(SIGHUP, c->gc_task, 1);
29 spin_unlock(&c->erase_completion_lock);
30} 29}
31 30
32/* This must only ever be called when no GC thread is currently running */ 31/* This must only ever be called when no GC thread is currently running */
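Note: jffs2_garbage_collect_trigger changes its locking contract here. Instead of taking erase_completion_lock itself, it now asserts the caller already holds it, which lets the callers added later in this patch (erase.c, nodemgmt.c, scan.c, wbuf.c) fold the trigger into an existing critical section. A pthread sketch of the same caller-holds-lock convention; pthreads has no assert_spin_locked equivalent, so the contract lives in a comment:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t erase_completion_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Must be called with erase_completion_lock held (the kernel version
     * enforces this with assert_spin_locked()). */
    static void gc_trigger_locked(void)
    {
        printf("wake the GC thread\n");
    }

    int main(void)
    {
        pthread_mutex_lock(&erase_completion_lock);
        /* ... manipulate the erase lists ... */
        gc_trigger_locked();   /* trigger folded into the critical section */
        pthread_mutex_unlock(&erase_completion_lock);
        return 0;
    }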
diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c
index 90cb60d09787..cd02acafde8a 100644
--- a/fs/jffs2/compr_lzo.c
+++ b/fs/jffs2/compr_lzo.c
@@ -11,7 +11,6 @@
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/slab.h>
15#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
16#include <linux/init.h> 15#include <linux/init.h>
17#include <linux/lzo.h> 16#include <linux/lzo.h>
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index cfd301a5edfc..b46661a42758 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -14,7 +14,6 @@
14#endif 14#endif
15 15
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/slab.h>
18#include <linux/zlib.h> 17#include <linux/zlib.h>
19#include <linux/zutil.h> 18#include <linux/zutil.h>
20#include "nodelist.h" 19#include "nodelist.h"
diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c
index 5544d31c066b..ec3538413926 100644
--- a/fs/jffs2/debug.c
+++ b/fs/jffs2/debug.c
@@ -15,6 +15,7 @@
15#include <linux/crc32.h> 15#include <linux/crc32.h>
16#include <linux/jffs2.h> 16#include <linux/jffs2.h>
17#include <linux/mtd/mtd.h> 17#include <linux/mtd/mtd.h>
18#include <linux/slab.h>
18#include "nodelist.h" 19#include "nodelist.h"
19#include "debug.h" 20#include "debug.h"
20 21
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index b47679be118a..6286ad9b00f7 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -103,9 +103,10 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
103 jffs2_erase_failed(c, jeb, bad_offset); 103 jffs2_erase_failed(c, jeb, bad_offset);
104} 104}
105 105
106void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count) 106int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
107{ 107{
108 struct jffs2_eraseblock *jeb; 108 struct jffs2_eraseblock *jeb;
109 int work_done = 0;
109 110
110 mutex_lock(&c->erase_free_sem); 111 mutex_lock(&c->erase_free_sem);
111 112
@@ -121,6 +122,7 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
121 mutex_unlock(&c->erase_free_sem); 122 mutex_unlock(&c->erase_free_sem);
122 jffs2_mark_erased_block(c, jeb); 123 jffs2_mark_erased_block(c, jeb);
123 124
125 work_done++;
124 if (!--count) { 126 if (!--count) {
125 D1(printk(KERN_DEBUG "Count reached. jffs2_erase_pending_blocks leaving\n")); 127 D1(printk(KERN_DEBUG "Count reached. jffs2_erase_pending_blocks leaving\n"));
126 goto done; 128 goto done;
@@ -157,6 +159,7 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
157 mutex_unlock(&c->erase_free_sem); 159 mutex_unlock(&c->erase_free_sem);
158 done: 160 done:
159 D1(printk(KERN_DEBUG "jffs2_erase_pending_blocks completed\n")); 161 D1(printk(KERN_DEBUG "jffs2_erase_pending_blocks completed\n"));
162 return work_done;
160} 163}
161 164
162static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) 165static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
@@ -165,10 +168,11 @@ static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblo
165 mutex_lock(&c->erase_free_sem); 168 mutex_lock(&c->erase_free_sem);
166 spin_lock(&c->erase_completion_lock); 169 spin_lock(&c->erase_completion_lock);
167 list_move_tail(&jeb->list, &c->erase_complete_list); 170 list_move_tail(&jeb->list, &c->erase_complete_list);
171 /* Wake the GC thread to mark them clean */
172 jffs2_garbage_collect_trigger(c);
168 spin_unlock(&c->erase_completion_lock); 173 spin_unlock(&c->erase_completion_lock);
169 mutex_unlock(&c->erase_free_sem); 174 mutex_unlock(&c->erase_free_sem);
170 /* Ensure that kupdated calls us again to mark them clean */ 175 wake_up(&c->erase_wait);
171 jffs2_erase_pending_trigger(c);
172} 176}
173 177
174static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset) 178static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset)
@@ -487,9 +491,9 @@ filebad:
487 491
488refile: 492refile:
489 /* Stick it back on the list from whence it came and come back later */ 493 /* Stick it back on the list from whence it came and come back later */
490 jffs2_erase_pending_trigger(c);
491 mutex_lock(&c->erase_free_sem); 494 mutex_lock(&c->erase_free_sem);
492 spin_lock(&c->erase_completion_lock); 495 spin_lock(&c->erase_completion_lock);
496 jffs2_garbage_collect_trigger(c);
493 list_move(&jeb->list, &c->erase_complete_list); 497 list_move(&jeb->list, &c->erase_complete_list);
494 spin_unlock(&c->erase_completion_lock); 498 spin_unlock(&c->erase_completion_lock);
495 mutex_unlock(&c->erase_free_sem); 499 mutex_unlock(&c->erase_free_sem);
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index b7b74e299142..e7291c161a19 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/slab.h>
14#include <linux/fs.h> 13#include <linux/fs.h>
15#include <linux/time.h> 14#include <linux/time.h>
16#include <linux/pagemap.h> 15#include <linux/pagemap.h>
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 3451a81b2142..86e0821fc989 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -313,8 +313,8 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
313 case S_IFBLK: 313 case S_IFBLK:
314 case S_IFCHR: 314 case S_IFCHR:
315 /* Read the device numbers from the media */ 315 /* Read the device numbers from the media */
316 if (f->metadata->size != sizeof(jdev.old) && 316 if (f->metadata->size != sizeof(jdev.old_id) &&
317 f->metadata->size != sizeof(jdev.new)) { 317 f->metadata->size != sizeof(jdev.new_id)) {
318 printk(KERN_NOTICE "Device node has strange size %d\n", f->metadata->size); 318 printk(KERN_NOTICE "Device node has strange size %d\n", f->metadata->size);
319 goto error_io; 319 goto error_io;
320 } 320 }
@@ -325,10 +325,10 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
325 printk(KERN_NOTICE "Read device numbers for inode %lu failed\n", (unsigned long)inode->i_ino); 325 printk(KERN_NOTICE "Read device numbers for inode %lu failed\n", (unsigned long)inode->i_ino);
326 goto error; 326 goto error;
327 } 327 }
328 if (f->metadata->size == sizeof(jdev.old)) 328 if (f->metadata->size == sizeof(jdev.old_id))
329 rdev = old_decode_dev(je16_to_cpu(jdev.old)); 329 rdev = old_decode_dev(je16_to_cpu(jdev.old_id));
330 else 330 else
331 rdev = new_decode_dev(je32_to_cpu(jdev.new)); 331 rdev = new_decode_dev(je32_to_cpu(jdev.new_id));
332 332
333 case S_IFSOCK: 333 case S_IFSOCK:
334 case S_IFIFO: 334 case S_IFIFO:
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 3b6f2fa12cff..f5e96bd656e8 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -214,6 +214,19 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
214 return ret; 214 return ret;
215 } 215 }
216 216
217 /* If there are any blocks which need erasing, erase them now */
218 if (!list_empty(&c->erase_complete_list) ||
219 !list_empty(&c->erase_pending_list)) {
220 spin_unlock(&c->erase_completion_lock);
221 D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() erasing pending blocks\n"));
222 if (jffs2_erase_pending_blocks(c, 1)) {
223 mutex_unlock(&c->alloc_sem);
224 return 0;
225 }
226 D1(printk(KERN_DEBUG "No progress from erasing blocks; doing GC anyway\n"));
227 spin_lock(&c->erase_completion_lock);
228 }
229
217 /* First, work out which block we're garbage-collecting */ 230 /* First, work out which block we're garbage-collecting */
218 jeb = c->gcblock; 231 jeb = c->gcblock;
219 232
@@ -222,7 +235,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
222 235
223 if (!jeb) { 236 if (!jeb) {
224 /* Couldn't find a free block. But maybe we can just erase one and make 'progress'? */ 237 /* Couldn't find a free block. But maybe we can just erase one and make 'progress'? */
225 if (!list_empty(&c->erase_pending_list)) { 238 if (c->nr_erasing_blocks) {
226 spin_unlock(&c->erase_completion_lock); 239 spin_unlock(&c->erase_completion_lock);
227 mutex_unlock(&c->alloc_sem); 240 mutex_unlock(&c->alloc_sem);
228 return -EAGAIN; 241 return -EAGAIN;
@@ -435,7 +448,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
435 list_add_tail(&c->gcblock->list, &c->erase_pending_list); 448 list_add_tail(&c->gcblock->list, &c->erase_pending_list);
436 c->gcblock = NULL; 449 c->gcblock = NULL;
437 c->nr_erasing_blocks++; 450 c->nr_erasing_blocks++;
438 jffs2_erase_pending_trigger(c); 451 jffs2_garbage_collect_trigger(c);
439 } 452 }
440 spin_unlock(&c->erase_completion_lock); 453 spin_unlock(&c->erase_completion_lock);
441 454
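Note: jffs2_erase_pending_blocks now reports whether it actually erased anything, and jffs2_garbage_collect_pass uses that to count a completed erase as this pass's progress before falling back to real garbage collection. A compressed sketch of that decision; erase_pending is a stand-in that "erases" whatever is queued:

    #include <stdio.h>

    static int pending = 1;   /* pretend one block is queued for erase */

    /* Stand-in for jffs2_erase_pending_blocks(): returns how much work
     * was done so the caller can tell whether it made progress. */
    static int erase_pending(int count)
    {
        int work_done = 0;
        while (pending && count--) {
            pending--;
            work_done++;
        }
        return work_done;
    }

    static void gc_pass(void)
    {
        if (erase_pending(1))
            return;            /* erasing counted as progress; skip GC */
        printf("no erases pending; doing real GC\n");
    }

    int main(void)
    {
        gc_pass();   /* first pass: consumed the queued erase */
        gc_pass();   /* second pass: falls through to GC */
        return 0;
    }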
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 87c6f555e1a0..af02bd138469 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -15,7 +15,6 @@
15#include <linux/mtd/mtd.h> 15#include <linux/mtd/mtd.h>
16#include <linux/rbtree.h> 16#include <linux/rbtree.h>
17#include <linux/crc32.h> 17#include <linux/crc32.h>
18#include <linux/slab.h>
19#include <linux/pagemap.h> 18#include <linux/pagemap.h>
20#include "nodelist.h" 19#include "nodelist.h"
21 20
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 507ed6ec1847..a881a42f19e3 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -312,11 +312,11 @@ static inline int jffs2_blocks_use_vmalloc(struct jffs2_sb_info *c)
312static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev) 312static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev)
313{ 313{
314 if (old_valid_dev(rdev)) { 314 if (old_valid_dev(rdev)) {
315 jdev->old = cpu_to_je16(old_encode_dev(rdev)); 315 jdev->old_id = cpu_to_je16(old_encode_dev(rdev));
316 return sizeof(jdev->old); 316 return sizeof(jdev->old_id);
317 } else { 317 } else {
318 jdev->new = cpu_to_je32(new_encode_dev(rdev)); 318 jdev->new_id = cpu_to_je32(new_encode_dev(rdev));
319 return sizeof(jdev->new); 319 return sizeof(jdev->new_id);
320 } 320 }
321} 321}
322 322
@@ -464,7 +464,7 @@ int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
464int jffs2_do_mount_fs(struct jffs2_sb_info *c); 464int jffs2_do_mount_fs(struct jffs2_sb_info *c);
465 465
466/* erase.c */ 466/* erase.c */
467void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count); 467int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count);
468void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); 468void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
469 469
470#ifdef CONFIG_JFFS2_FS_WRITEBUFFER 470#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
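Note: the union members old/new become old_id/new_id; the encoding logic in jffs2_encode_dev is otherwise untouched, picking the legacy 16-bit device encoding when it fits and the 32-bit one otherwise, and it is this size that jffs2_iget checks against in the fs.c hunk above. A simplified stand-alone version of that choice; the validity test and encodings are stand-ins for old_valid_dev/old_encode_dev/new_encode_dev:

    #include <stdint.h>
    #include <stdio.h>

    union device_node {
        uint16_t old_id;   /* legacy 16-bit dev_t encoding */
        uint32_t new_id;   /* 32-bit encoding for larger numbers */
    };

    static int encode_dev(union device_node *jdev, uint32_t rdev)
    {
        if (rdev <= 0xffff) {            /* stand-in for old_valid_dev() */
            jdev->old_id = (uint16_t)rdev;
            return sizeof(jdev->old_id);
        }
        jdev->new_id = rdev;
        return sizeof(jdev->new_id);
    }

    int main(void)
    {
        union device_node jdev;
        printf("small dev: %d bytes\n", encode_dev(&jdev, 0x0801));    /* 2 */
        printf("large dev: %d bytes\n", encode_dev(&jdev, 0x1234567)); /* 4 */
        return 0;
    }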
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 21a052915aa9..694aa5b03505 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/slab.h>
14#include <linux/mtd/mtd.h> 13#include <linux/mtd/mtd.h>
15#include <linux/compiler.h> 14#include <linux/compiler.h>
16#include <linux/sched.h> /* For cond_resched() */ 15#include <linux/sched.h> /* For cond_resched() */
@@ -117,9 +116,21 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
117 116
118 ret = jffs2_garbage_collect_pass(c); 117 ret = jffs2_garbage_collect_pass(c);
119 118
120 if (ret == -EAGAIN) 119 if (ret == -EAGAIN) {
121 jffs2_erase_pending_blocks(c, 1); 120 spin_lock(&c->erase_completion_lock);
122 else if (ret) 121 if (c->nr_erasing_blocks &&
122 list_empty(&c->erase_pending_list) &&
123 list_empty(&c->erase_complete_list)) {
124 DECLARE_WAITQUEUE(wait, current);
125 set_current_state(TASK_UNINTERRUPTIBLE);
126 add_wait_queue(&c->erase_wait, &wait);
127 D1(printk(KERN_DEBUG "%s waiting for erase to complete\n", __func__));
128 spin_unlock(&c->erase_completion_lock);
129
130 schedule();
131 } else
132 spin_unlock(&c->erase_completion_lock);
133 } else if (ret)
123 return ret; 134 return ret;
124 135
125 cond_resched(); 136 cond_resched();
@@ -218,7 +229,7 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c)
218 ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list); 229 ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list);
219 list_move_tail(&ejeb->list, &c->erase_pending_list); 230 list_move_tail(&ejeb->list, &c->erase_pending_list);
220 c->nr_erasing_blocks++; 231 c->nr_erasing_blocks++;
221 jffs2_erase_pending_trigger(c); 232 jffs2_garbage_collect_trigger(c);
222 D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n", 233 D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n",
223 ejeb->offset)); 234 ejeb->offset));
224 } 235 }
@@ -470,7 +481,9 @@ struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c,
470void jffs2_complete_reservation(struct jffs2_sb_info *c) 481void jffs2_complete_reservation(struct jffs2_sb_info *c)
471{ 482{
472 D1(printk(KERN_DEBUG "jffs2_complete_reservation()\n")); 483 D1(printk(KERN_DEBUG "jffs2_complete_reservation()\n"));
484 spin_lock(&c->erase_completion_lock);
473 jffs2_garbage_collect_trigger(c); 485 jffs2_garbage_collect_trigger(c);
486 spin_unlock(&c->erase_completion_lock);
474 mutex_unlock(&c->alloc_sem); 487 mutex_unlock(&c->alloc_sem);
475} 488}
476 489
@@ -612,7 +625,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
612 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n")); 625 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n"));
613 list_add_tail(&jeb->list, &c->erase_pending_list); 626 list_add_tail(&jeb->list, &c->erase_pending_list);
614 c->nr_erasing_blocks++; 627 c->nr_erasing_blocks++;
615 jffs2_erase_pending_trigger(c); 628 jffs2_garbage_collect_trigger(c);
616 } else { 629 } else {
617 /* Sometimes, however, we leave it elsewhere so it doesn't get 630 /* Sometimes, however, we leave it elsewhere so it doesn't get
618 immediately reused, and we spread the load a bit. */ 631 immediately reused, and we spread the load a bit. */
@@ -733,6 +746,10 @@ int jffs2_thread_should_wake(struct jffs2_sb_info *c)
733 int nr_very_dirty = 0; 746 int nr_very_dirty = 0;
734 struct jffs2_eraseblock *jeb; 747 struct jffs2_eraseblock *jeb;
735 748
749 if (!list_empty(&c->erase_complete_list) ||
750 !list_empty(&c->erase_pending_list))
751 return 1;
752
736 if (c->unchecked_size) { 753 if (c->unchecked_size) {
737 D1(printk(KERN_DEBUG "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n", 754 D1(printk(KERN_DEBUG "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n",
738 c->unchecked_size, c->checked_ino)); 755 c->unchecked_size, c->checked_ino));
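Note: the -EAGAIN path in jffs2_reserve_space no longer spins on jffs2_erase_pending_blocks; if erases are already in flight and nothing is left to start, it queues itself on the new erase_wait waitqueue, drops the lock, and sleeps until jffs2_erase_succeeded's wake_up. A userspace analogue using a condvar in place of the waitqueue + schedule(); this is simplified (the kernel version also removes itself from the queue and rechecks its lists):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  erase_wait = PTHREAD_COND_INITIALIZER;
    static bool erase_done;

    static void wait_for_erase(void)
    {
        pthread_mutex_lock(&lock);
        while (!erase_done)
            pthread_cond_wait(&erase_wait, &lock); /* drops lock while asleep */
        pthread_mutex_unlock(&lock);
    }

    static void *eraser(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&lock);
        erase_done = true;                 /* like wake_up(&c->erase_wait) */
        pthread_cond_signal(&erase_wait);
        pthread_mutex_unlock(&lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t t;
        pthread_create(&t, NULL, eraser, NULL);
        wait_for_erase();
        pthread_join(t, NULL);
        printf("erase completed, retrying reservation\n");
        return 0;
    }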
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index a7f03b7ebcb3..035a767f958b 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -140,8 +140,7 @@ void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c);
140 140
141#endif /* WRITEBUFFER */ 141#endif /* WRITEBUFFER */
142 142
143/* erase.c */ 143static inline void jffs2_dirty_trigger(struct jffs2_sb_info *c)
144static inline void jffs2_erase_pending_trigger(struct jffs2_sb_info *c)
145{ 144{
146 OFNI_BS_2SFFJ(c)->s_dirt = 1; 145 OFNI_BS_2SFFJ(c)->s_dirt = 1;
147} 146}
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index e22de8397b74..d32ee9412cb9 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -567,7 +567,7 @@ static void jffs2_free_tmp_dnode_info_list(struct rb_root *list)
567 else BUG(); 567 else BUG();
568 } 568 }
569 } 569 }
570 list->rb_node = NULL; 570 *list = RB_ROOT;
571} 571}
572 572
573static void jffs2_free_full_dirent_list(struct jffs2_full_dirent *fd) 573static void jffs2_free_full_dirent_list(struct jffs2_full_dirent *fd)
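
The readinode.c change swaps a field-level reset for a struct assignment. Writing *list = RB_ROOT reinitialises every member of struct rb_root in one go, so the code stays correct even if rb_root later grows additional fields; clearing list->rb_node by hand only works while rb_node happens to be the sole member. In isolation:

    /* Illustrative: RB_ROOT expands to a zero initialiser for
     * struct rb_root, so plain struct assignment resets the root. */
    struct rb_root tree = RB_ROOT;

    tree = RB_ROOT;             /* robust against layout changes */
    /* tree.rb_node = NULL;        fragile: assumes one member   */
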
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 696686cc206e..46f870d1cc36 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -260,7 +260,9 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
260 ret = -EIO; 260 ret = -EIO;
261 goto out; 261 goto out;
262 } 262 }
263 jffs2_erase_pending_trigger(c); 263 spin_lock(&c->erase_completion_lock);
264 jffs2_garbage_collect_trigger(c);
265 spin_unlock(&c->erase_completion_lock);
264 } 266 }
265 ret = 0; 267 ret = 0;
266 out: 268 out:
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 9a80e8e595d0..511e2d609d12 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -63,8 +63,6 @@ static void jffs2_write_super(struct super_block *sb)
63 63
64 if (!(sb->s_flags & MS_RDONLY)) { 64 if (!(sb->s_flags & MS_RDONLY)) {
65 D1(printk(KERN_DEBUG "jffs2_write_super()\n")); 65 D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
66 jffs2_garbage_collect_trigger(c);
67 jffs2_erase_pending_blocks(c, 0);
68 jffs2_flush_wbuf_gc(c, 0); 66 jffs2_flush_wbuf_gc(c, 0);
69 } 67 }
70 68
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index 4ec11e8bda8c..b955626071c2 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/slab.h>
14#include <linux/fs.h> 13#include <linux/fs.h>
15#include <linux/namei.h> 14#include <linux/namei.h>
16#include "nodelist.h" 15#include "nodelist.h"
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 5ef7bac265e5..07ee1546b2fa 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -84,7 +84,7 @@ static void jffs2_wbuf_dirties_inode(struct jffs2_sb_info *c, uint32_t ino)
84 struct jffs2_inodirty *new; 84 struct jffs2_inodirty *new;
85 85
86 /* Mark the superblock dirty so that kupdated will flush... */ 86 /* Mark the superblock dirty so that kupdated will flush... */
87 jffs2_erase_pending_trigger(c); 87 jffs2_dirty_trigger(c);
88 88
89 if (jffs2_wbuf_pending_for_ino(c, ino)) 89 if (jffs2_wbuf_pending_for_ino(c, ino))
90 return; 90 return;
@@ -121,7 +121,7 @@ static inline void jffs2_refile_wbuf_blocks(struct jffs2_sb_info *c)
121 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n")); 121 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n"));
122 list_add_tail(&jeb->list, &c->erase_pending_list); 122 list_add_tail(&jeb->list, &c->erase_pending_list);
123 c->nr_erasing_blocks++; 123 c->nr_erasing_blocks++;
124 jffs2_erase_pending_trigger(c); 124 jffs2_garbage_collect_trigger(c);
125 } else { 125 } else {
126 /* Sometimes, however, we leave it elsewhere so it doesn't get 126 /* Sometimes, however, we leave it elsewhere so it doesn't get
127 immediately reused, and we spread the load a bit. */ 127 immediately reused, and we spread the load a bit. */
@@ -152,7 +152,7 @@ static void jffs2_block_refile(struct jffs2_sb_info *c, struct jffs2_eraseblock
152 D1(printk("Refiling block at %08x to erase_pending_list\n", jeb->offset)); 152 D1(printk("Refiling block at %08x to erase_pending_list\n", jeb->offset));
153 list_add(&jeb->list, &c->erase_pending_list); 153 list_add(&jeb->list, &c->erase_pending_list);
154 c->nr_erasing_blocks++; 154 c->nr_erasing_blocks++;
155 jffs2_erase_pending_trigger(c); 155 jffs2_garbage_collect_trigger(c);
156 } 156 }
157 157
158 if (!jffs2_prealloc_raw_node_refs(c, jeb, 1)) { 158 if (!jffs2_prealloc_raw_node_refs(c, jeb, 1)) {
@@ -543,7 +543,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
543 D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset)); 543 D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset));
544 list_move(&jeb->list, &c->erase_pending_list); 544 list_move(&jeb->list, &c->erase_pending_list);
545 c->nr_erasing_blocks++; 545 c->nr_erasing_blocks++;
546 jffs2_erase_pending_trigger(c); 546 jffs2_garbage_collect_trigger(c);
547 } 547 }
548 548
549 jffs2_dbg_acct_sanity_check_nolock(c, jeb); 549 jffs2_dbg_acct_sanity_check_nolock(c, jeb);
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index ca29440e9435..c819eb0e982d 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -12,7 +12,6 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/crc32.h> 14#include <linux/crc32.h>
15#include <linux/slab.h>
16#include <linux/pagemap.h> 15#include <linux/pagemap.h>
17#include <linux/mtd/mtd.h> 16#include <linux/mtd/mtd.h>
18#include "nodelist.h" 17#include "nodelist.h"
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 213169780b6c..1057a4998e4e 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -19,6 +19,7 @@
19 */ 19 */
20 20
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/slab.h>
22#include <linux/fs.h> 23#include <linux/fs.h>
23#include <linux/posix_acl_xattr.h> 24#include <linux/posix_acl_xattr.h>
24#include "jfs_incore.h" 25#include "jfs_incore.h"
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 9dd126276c9f..ed9ba6fe04f5 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -61,7 +61,7 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino)
61 inode->i_op = &page_symlink_inode_operations; 61 inode->i_op = &page_symlink_inode_operations;
62 inode->i_mapping->a_ops = &jfs_aops; 62 inode->i_mapping->a_ops = &jfs_aops;
63 } else { 63 } else {
64 inode->i_op = &jfs_symlink_inode_operations; 64 inode->i_op = &jfs_fast_symlink_inode_operations;
65 /* 65 /*
66 * The inline data should be null-terminated, but 66 * The inline data should be null-terminated, but
67 * don't let on-disk corruption crash the kernel 67 * don't let on-disk corruption crash the kernel
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index d9b031cf69f5..c92ea3b3ea5e 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/slab.h>
20#include "jfs_incore.h" 21#include "jfs_incore.h"
21#include "jfs_superblock.h" 22#include "jfs_superblock.h"
22#include "jfs_dmap.h" 23#include "jfs_dmap.h"
@@ -195,7 +196,7 @@ int dbMount(struct inode *ipbmap)
195 bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag); 196 bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
196 bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref); 197 bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);
197 bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel); 198 bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel);
198 bmp->db_agheigth = le32_to_cpu(dbmp_le->dn_agheigth); 199 bmp->db_agheight = le32_to_cpu(dbmp_le->dn_agheight);
199 bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth); 200 bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth);
200 bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart); 201 bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart);
201 bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size); 202 bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size);
@@ -287,7 +288,7 @@ int dbSync(struct inode *ipbmap)
287 dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag); 288 dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag);
288 dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref); 289 dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref);
289 dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel); 290 dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel);
290 dbmp_le->dn_agheigth = cpu_to_le32(bmp->db_agheigth); 291 dbmp_le->dn_agheight = cpu_to_le32(bmp->db_agheight);
291 dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth); 292 dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth);
292 dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart); 293 dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart);
293 dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size); 294 dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size);
@@ -1440,7 +1441,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1440 * tree index of this allocation group within the control page. 1441 * tree index of this allocation group within the control page.
1441 */ 1442 */
1442 agperlev = 1443 agperlev =
1443 (1 << (L2LPERCTL - (bmp->db_agheigth << 1))) / bmp->db_agwidth; 1444 (1 << (L2LPERCTL - (bmp->db_agheight << 1))) / bmp->db_agwidth;
1444 ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1)); 1445 ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1));
1445 1446
1446 /* dmap control page trees fan-out by 4 and a single allocation 1447 /* dmap control page trees fan-out by 4 and a single allocation
@@ -1459,7 +1460,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1459 * the subtree to find the leftmost leaf that describes this 1460 * the subtree to find the leftmost leaf that describes this
1460 * free space. 1461 * free space.
1461 */ 1462 */
1462 for (k = bmp->db_agheigth; k > 0; k--) { 1463 for (k = bmp->db_agheight; k > 0; k--) {
1463 for (n = 0, m = (ti << 2) + 1; n < 4; n++) { 1464 for (n = 0, m = (ti << 2) + 1; n < 4; n++) {
1464 if (l2nb <= dcp->stree[m + n]) { 1465 if (l2nb <= dcp->stree[m + n]) {
1465 ti = m + n; 1466 ti = m + n;
@@ -2437,7 +2438,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
2437 2438
2438 /* check if this is a control page update for an allocation. 2439 /* check if this is a control page update for an allocation.
2439 * if so, update the leaf to reflect the new leaf value using 2440 * if so, update the leaf to reflect the new leaf value using
2440 * dbSplit(); otherwise (deallocation), use dbJoin() to udpate 2441 * dbSplit(); otherwise (deallocation), use dbJoin() to update
2441 * the leaf with the new value. in addition to updating the 2442 * the leaf with the new value. in addition to updating the
2442 * leaf, dbSplit() will also split the binary buddy system of 2443 * leaf, dbSplit() will also split the binary buddy system of
2443 * the leaves, if required, and bubble new values within the 2444 * the leaves, if required, and bubble new values within the
@@ -3606,7 +3607,7 @@ void dbFinalizeBmap(struct inode *ipbmap)
3606 } 3607 }
3607 3608
3608 /* 3609 /*
3609 * compute db_aglevel, db_agheigth, db_width, db_agstart: 3610 * compute db_aglevel, db_agheight, db_width, db_agstart:
3610 * an ag is covered in aglevel dmapctl summary tree, 3611 * an ag is covered in aglevel dmapctl summary tree,
3611 * at agheight level height (from leaf) with agwidth number of nodes 3612 * at agheight level height (from leaf) with agwidth number of nodes
3612 * each, which starts at agstart index node of the summary tree node 3613 * each, which starts at agstart index node of the summary tree node
@@ -3615,9 +3616,9 @@ void dbFinalizeBmap(struct inode *ipbmap)
3615 bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize); 3616 bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize);
3616 l2nl = 3617 l2nl =
3617 bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL); 3618 bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL);
3618 bmp->db_agheigth = l2nl >> 1; 3619 bmp->db_agheight = l2nl >> 1;
3619 bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheigth << 1)); 3620 bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheight << 1));
3620 for (i = 5 - bmp->db_agheigth, bmp->db_agstart = 0, n = 1; i > 0; 3621 for (i = 5 - bmp->db_agheight, bmp->db_agstart = 0, n = 1; i > 0;
3621 i--) { 3622 i--) {
3622 bmp->db_agstart += n; 3623 bmp->db_agstart += n;
3623 n <<= 2; 3624 n <<= 2;
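
The agheigth to agheight rename is mechanical, but the arithmetic in dbFinalizeBmap() deserves a gloss: each dmapctl level fans out by four (two bits per level), so an allocation group spanning 2^l2nl leaves sits agheight = l2nl >> 1 full levels above the leaves and occupies agwidth = 1 << (l2nl - 2*agheight) nodes, i.e. one or two. A worked example with an assumed l2nl of 5:

    /* Assumed input, for illustration only: l2nl = 5. */
    int l2nl = 5, agheight, agwidth, agstart, i, n;

    agheight = l2nl >> 1;                     /* 2 full 4-ary levels  */
    agwidth  = 1 << (l2nl - (agheight << 1)); /* 1 << 1 == 2 nodes    */
    for (i = 5 - agheight, agstart = 0, n = 1; i > 0; i--) {
            agstart += n;                     /* rows above: 1, 4, 16 */
            n <<= 2;
    }
    /* agstart == 21: index of the first summary-tree node on the
     * row that describes this allocation group. */
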
diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h
index 1a6eb41569bc..6dcb906c55d8 100644
--- a/fs/jfs/jfs_dmap.h
+++ b/fs/jfs/jfs_dmap.h
@@ -210,7 +210,7 @@ struct dbmap_disk {
210 __le32 dn_maxag; /* 4: max active alloc group number */ 210 __le32 dn_maxag; /* 4: max active alloc group number */
211 __le32 dn_agpref; /* 4: preferred alloc group (hint) */ 211 __le32 dn_agpref; /* 4: preferred alloc group (hint) */
212 __le32 dn_aglevel; /* 4: dmapctl level holding the AG */ 212 __le32 dn_aglevel; /* 4: dmapctl level holding the AG */
213 __le32 dn_agheigth; /* 4: height in dmapctl of the AG */ 213 __le32 dn_agheight; /* 4: height in dmapctl of the AG */
214 __le32 dn_agwidth; /* 4: width in dmapctl of the AG */ 214 __le32 dn_agwidth; /* 4: width in dmapctl of the AG */
215 __le32 dn_agstart; /* 4: start tree index at AG height */ 215 __le32 dn_agstart; /* 4: start tree index at AG height */
216 __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */ 216 __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */
@@ -229,7 +229,7 @@ struct dbmap {
229 int dn_maxag; /* max active alloc group number */ 229 int dn_maxag; /* max active alloc group number */
230 int dn_agpref; /* preferred alloc group (hint) */ 230 int dn_agpref; /* preferred alloc group (hint) */
231 int dn_aglevel; /* dmapctl level holding the AG */ 231 int dn_aglevel; /* dmapctl level holding the AG */
232 int dn_agheigth; /* height in dmapctl of the AG */ 232 int dn_agheight; /* height in dmapctl of the AG */
233 int dn_agwidth; /* width in dmapctl of the AG */ 233 int dn_agwidth; /* width in dmapctl of the AG */
234 int dn_agstart; /* start tree index at AG height */ 234 int dn_agstart; /* start tree index at AG height */
235 int dn_agl2size; /* l2 num of blks per alloc group */ 235 int dn_agl2size; /* l2 num of blks per alloc group */
@@ -255,7 +255,7 @@ struct bmap {
255#define db_agsize db_bmap.dn_agsize 255#define db_agsize db_bmap.dn_agsize
256#define db_agl2size db_bmap.dn_agl2size 256#define db_agl2size db_bmap.dn_agl2size
257#define db_agwidth db_bmap.dn_agwidth 257#define db_agwidth db_bmap.dn_agwidth
258#define db_agheigth db_bmap.dn_agheigth 258#define db_agheight db_bmap.dn_agheight
259#define db_agstart db_bmap.dn_agstart 259#define db_agstart db_bmap.dn_agstart
260#define db_numag db_bmap.dn_numag 260#define db_numag db_bmap.dn_numag
261#define db_maxlevel db_bmap.dn_maxlevel 261#define db_maxlevel db_bmap.dn_maxlevel
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 0e4623be70ce..9197a1b0d02d 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -102,6 +102,7 @@
102 102
103#include <linux/fs.h> 103#include <linux/fs.h>
104#include <linux/quotaops.h> 104#include <linux/quotaops.h>
105#include <linux/slab.h>
105#include "jfs_incore.h" 106#include "jfs_incore.h"
106#include "jfs_superblock.h" 107#include "jfs_superblock.h"
107#include "jfs_filsys.h" 108#include "jfs_filsys.h"
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 0fc30407f039..f8332dc8eeb2 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -45,6 +45,7 @@
45#include <linux/buffer_head.h> 45#include <linux/buffer_head.h>
46#include <linux/pagemap.h> 46#include <linux/pagemap.h>
47#include <linux/quotaops.h> 47#include <linux/quotaops.h>
48#include <linux/slab.h>
48 49
49#include "jfs_incore.h" 50#include "jfs_incore.h"
50#include "jfs_inode.h" 51#include "jfs_inode.h"
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 79e2c79661df..9e6bda30a6e8 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -48,5 +48,6 @@ extern const struct file_operations jfs_dir_operations;
48extern const struct inode_operations jfs_file_inode_operations; 48extern const struct inode_operations jfs_file_inode_operations;
49extern const struct file_operations jfs_file_operations; 49extern const struct file_operations jfs_file_operations;
50extern const struct inode_operations jfs_symlink_inode_operations; 50extern const struct inode_operations jfs_symlink_inode_operations;
51extern const struct inode_operations jfs_fast_symlink_inode_operations;
51extern const struct dentry_operations jfs_ci_dentry_operations; 52extern const struct dentry_operations jfs_ci_dentry_operations;
52#endif /* _H_JFS_INODE */ 53#endif /* _H_JFS_INODE */
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 335c4de6552d..c51af2a14516 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -70,6 +70,7 @@
70#include <linux/delay.h> 70#include <linux/delay.h>
71#include <linux/mutex.h> 71#include <linux/mutex.h>
72#include <linux/seq_file.h> 72#include <linux/seq_file.h>
73#include <linux/slab.h>
73#include "jfs_incore.h" 74#include "jfs_incore.h"
74#include "jfs_filsys.h" 75#include "jfs_filsys.h"
75#include "jfs_metapage.h" 76#include "jfs_metapage.h"
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 07b6c5dfb4b6..48b44bd8267b 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -21,6 +21,7 @@
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/bio.h> 23#include <linux/bio.h>
24#include <linux/slab.h>
24#include <linux/init.h> 25#include <linux/init.h>
25#include <linux/buffer_head.h> 26#include <linux/buffer_head.h>
26#include <linux/mempool.h> 27#include <linux/mempool.h>
diff --git a/fs/jfs/jfs_unicode.h b/fs/jfs/jfs_unicode.h
index 3fbb3a225590..8f0f02cb6ca6 100644
--- a/fs/jfs/jfs_unicode.h
+++ b/fs/jfs/jfs_unicode.h
@@ -19,6 +19,7 @@
19#ifndef _H_JFS_UNICODE 19#ifndef _H_JFS_UNICODE
20#define _H_JFS_UNICODE 20#define _H_JFS_UNICODE
21 21
22#include <linux/slab.h>
22#include <asm/byteorder.h> 23#include <asm/byteorder.h>
23#include "jfs_types.h" 24#include "jfs_types.h"
24 25
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 4a3e9f39c21d..a9cf8e8675be 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -956,7 +956,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
956 */ 956 */
957 957
958 if (ssize <= IDATASIZE) { 958 if (ssize <= IDATASIZE) {
959 ip->i_op = &jfs_symlink_inode_operations; 959 ip->i_op = &jfs_fast_symlink_inode_operations;
960 960
961 i_fastsymlink = JFS_IP(ip)->i_inline; 961 i_fastsymlink = JFS_IP(ip)->i_inline;
962 memcpy(i_fastsymlink, name, ssize); 962 memcpy(i_fastsymlink, name, ssize);
@@ -978,7 +978,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
978 else { 978 else {
979 jfs_info("jfs_symlink: allocate extent ip:0x%p", ip); 979 jfs_info("jfs_symlink: allocate extent ip:0x%p", ip);
980 980
981 ip->i_op = &page_symlink_inode_operations; 981 ip->i_op = &jfs_symlink_inode_operations;
982 ip->i_mapping->a_ops = &jfs_aops; 982 ip->i_mapping->a_ops = &jfs_aops;
983 983
984 /* 984 /*
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 7f24a0bb08ca..1aba0039f1c9 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -81,6 +81,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
81 struct inode *iplist[1]; 81 struct inode *iplist[1];
82 struct jfs_superblock *j_sb, *j_sb2; 82 struct jfs_superblock *j_sb, *j_sb2;
83 uint old_agsize; 83 uint old_agsize;
84 int agsizechanged = 0;
84 struct buffer_head *bh, *bh2; 85 struct buffer_head *bh, *bh2;
85 86
86 /* If the volume hasn't grown, get out now */ 87 /* If the volume hasn't grown, get out now */
@@ -333,6 +334,9 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
333 */ 334 */
334 if ((rc = dbExtendFS(ipbmap, XAddress, nblocks))) 335 if ((rc = dbExtendFS(ipbmap, XAddress, nblocks)))
335 goto error_out; 336 goto error_out;
337
338 agsizechanged |= (bmp->db_agsize != old_agsize);
339
336 /* 340 /*
337 * the map now has extended to cover additional nblocks: 341 * the map now has extended to cover additional nblocks:
338 * dn_mapsize = oldMapsize + nblocks; 342 * dn_mapsize = oldMapsize + nblocks;
@@ -432,7 +436,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
432 * will correctly identify the new ag); 436 * will correctly identify the new ag);
433 */ 437 */
434 /* if new AG size the same as old AG size, done! */ 438 /* if new AG size the same as old AG size, done! */
435 if (bmp->db_agsize != old_agsize) { 439 if (agsizechanged) {
436 if ((rc = diExtendFS(ipimap, ipbmap))) 440 if ((rc = diExtendFS(ipimap, ipbmap)))
437 goto error_out; 441 goto error_out;
438 442
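
The resize fix latches the result of the size comparison into agsizechanged immediately after dbExtendFS() returns, rather than comparing bmp->db_agsize against old_agsize once at the end; jfs_extendfs() can extend the block map in more than one pass, and by the time of the final check db_agsize may again equal the stale old_agsize even though it changed along the way. The OR-latch idiom, reduced to its core:

    /* Sketch of the latch: once set, later passes cannot clear it. */
    int changed = 0;

    changed |= (bmp->db_agsize != old_agsize);  /* after each extend */
    /* ... further extension passes ... */
    if (changed)
            rc = diExtendFS(ipimap, ipbmap);    /* rebuild imap AGs  */
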
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 266699deb1c6..b66832ac33ac 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -30,6 +30,7 @@
30#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
31#include <linux/exportfs.h> 31#include <linux/exportfs.h>
32#include <linux/crc32.h> 32#include <linux/crc32.h>
33#include <linux/slab.h>
33#include <asm/uaccess.h> 34#include <asm/uaccess.h>
34#include <linux/seq_file.h> 35#include <linux/seq_file.h>
35#include <linux/smp_lock.h> 36#include <linux/smp_lock.h>
@@ -445,10 +446,8 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
445 /* initialize the mount flag and determine the default error handler */ 446 /* initialize the mount flag and determine the default error handler */
446 flag = JFS_ERR_REMOUNT_RO; 447 flag = JFS_ERR_REMOUNT_RO;
447 448
448 if (!parse_options((char *) data, sb, &newLVSize, &flag)) { 449 if (!parse_options((char *) data, sb, &newLVSize, &flag))
449 kfree(sbi); 450 goto out_kfree;
450 return -EINVAL;
451 }
452 sbi->flag = flag; 451 sbi->flag = flag;
453 452
454#ifdef CONFIG_JFS_POSIX_ACL 453#ifdef CONFIG_JFS_POSIX_ACL
@@ -457,7 +456,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
457 456
458 if (newLVSize) { 457 if (newLVSize) {
459 printk(KERN_ERR "resize option for remount only\n"); 458 printk(KERN_ERR "resize option for remount only\n");
460 return -EINVAL; 459 goto out_kfree;
461 } 460 }
462 461
463 /* 462 /*
@@ -477,7 +476,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
477 inode = new_inode(sb); 476 inode = new_inode(sb);
478 if (inode == NULL) { 477 if (inode == NULL) {
479 ret = -ENOMEM; 478 ret = -ENOMEM;
480 goto out_kfree; 479 goto out_unload;
481 } 480 }
482 inode->i_ino = 0; 481 inode->i_ino = 0;
483 inode->i_nlink = 1; 482 inode->i_nlink = 1;
@@ -549,9 +548,10 @@ out_mount_failed:
549 make_bad_inode(sbi->direct_inode); 548 make_bad_inode(sbi->direct_inode);
550 iput(sbi->direct_inode); 549 iput(sbi->direct_inode);
551 sbi->direct_inode = NULL; 550 sbi->direct_inode = NULL;
552out_kfree: 551out_unload:
553 if (sbi->nls_tab) 552 if (sbi->nls_tab)
554 unload_nls(sbi->nls_tab); 553 unload_nls(sbi->nls_tab);
554out_kfree:
555 kfree(sbi); 555 kfree(sbi);
556 return ret; 556 return ret;
557} 557}
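
The jfs_fill_super() rework is a textbook goto-ladder repair: the early failures (option parsing and the remount-only resize check) used to return -EINVAL directly and leak sbi, while the old out_kfree label unloaded an NLS table on paths that had never loaded one. After the patch each label undoes exactly one acquisition, in reverse order. The general shape, as a sketch with the surrounding steps elided:

    ret = -EINVAL;
    if (!parse_options((char *)data, sb, &newLVSize, &flag))
            goto out_kfree;         /* nothing else held yet      */
    if (newLVSize)
            goto out_kfree;         /* resize is remount-only     */
    /* ... NLS table loaded here ... */
    inode = new_inode(sb);
    if (inode == NULL) {
            ret = -ENOMEM;
            goto out_unload;        /* NLS table must be dropped  */
    }
    return 0;
    out_unload:
            if (sbi->nls_tab)
                    unload_nls(sbi->nls_tab);
    out_kfree:
            kfree(sbi);
            return ret;
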
diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c
index 4af1a05aad0a..205b946d8e0d 100644
--- a/fs/jfs/symlink.c
+++ b/fs/jfs/symlink.c
@@ -29,9 +29,21 @@ static void *jfs_follow_link(struct dentry *dentry, struct nameidata *nd)
29 return NULL; 29 return NULL;
30} 30}
31 31
32const struct inode_operations jfs_symlink_inode_operations = { 32const struct inode_operations jfs_fast_symlink_inode_operations = {
33 .readlink = generic_readlink, 33 .readlink = generic_readlink,
34 .follow_link = jfs_follow_link, 34 .follow_link = jfs_follow_link,
35 .setattr = jfs_setattr,
36 .setxattr = jfs_setxattr,
37 .getxattr = jfs_getxattr,
38 .listxattr = jfs_listxattr,
39 .removexattr = jfs_removexattr,
40};
41
42const struct inode_operations jfs_symlink_inode_operations = {
43 .readlink = generic_readlink,
44 .follow_link = page_follow_link_light,
45 .put_link = page_put_link,
46 .setattr = jfs_setattr,
35 .setxattr = jfs_setxattr, 47 .setxattr = jfs_setxattr,
36 .getxattr = jfs_getxattr, 48 .getxattr = jfs_getxattr,
37 .listxattr = jfs_listxattr, 49 .listxattr = jfs_listxattr,
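
The symlink split leaves JFS with two op tables: jfs_fast_symlink_inode_operations for targets short enough to live in the inode's inline area (the ssize <= IDATASIZE branch in the namei.c hunk above), and jfs_symlink_inode_operations for extent-backed targets, which now go through page_follow_link_light()/page_put_link() while keeping the JFS setattr and xattr methods that the generic page_symlink_inode_operations table lacked. For the fast case, the follow_link shown truncated above presumably amounts to handing namei the inline buffer; a sketch consistent with the "return NULL" context line:

    static void *jfs_follow_link(struct dentry *dentry,
                                 struct nameidata *nd)
    {
            char *s = JFS_IP(dentry->d_inode)->i_inline;

            nd_set_link(nd, s);  /* target lives in the inode itself */
            return NULL;         /* nothing for put_link to release  */
    }
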
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 1f594ab21895..fa96bbb26343 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -21,6 +21,7 @@
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/xattr.h> 22#include <linux/xattr.h>
23#include <linux/posix_acl_xattr.h> 23#include <linux/posix_acl_xattr.h>
24#include <linux/slab.h>
24#include <linux/quotaops.h> 25#include <linux/quotaops.h>
25#include <linux/security.h> 26#include <linux/security.h>
26#include "jfs_incore.h" 27#include "jfs_incore.h"
diff --git a/fs/libfs.c b/fs/libfs.c
index 9e50bcf55857..232bea425b09 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -5,6 +5,7 @@
5 5
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/pagemap.h> 7#include <linux/pagemap.h>
8#include <linux/slab.h>
8#include <linux/mount.h> 9#include <linux/mount.h>
9#include <linux/vfs.h> 10#include <linux/vfs.h>
10#include <linux/mutex.h> 11#include <linux/mutex.h>
@@ -546,6 +547,40 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
546} 547}
547 548
548/** 549/**
550 * simple_write_to_buffer - copy data from user space to the buffer
551 * @to: the buffer to write to
552 * @available: the size of the buffer
553 * @ppos: the current position in the buffer
554 * @from: the user space buffer to read from
555 * @count: the maximum number of bytes to read
556 *
557 * The simple_write_to_buffer() function reads up to @count bytes from the user
558 * space address starting at @from into the buffer @to at offset @ppos.
559 *
560 * On success, the number of bytes written is returned and the offset @ppos is
 561 * advanced by this number, or a negative value is returned on error.
562 **/
563ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
564 const void __user *from, size_t count)
565{
566 loff_t pos = *ppos;
567 size_t res;
568
569 if (pos < 0)
570 return -EINVAL;
571 if (pos >= available || !count)
572 return 0;
573 if (count > available - pos)
574 count = available - pos;
575 res = copy_from_user(to + pos, from, count);
576 if (res == count)
577 return -EFAULT;
578 count -= res;
579 *ppos = pos + count;
580 return count;
581}
582
583/**
549 * memory_read_from_buffer - copy data from the buffer 584 * memory_read_from_buffer - copy data from the buffer
550 * @to: the kernel space buffer to read to 585 * @to: the kernel space buffer to read to
551 * @count: the maximum number of bytes to read 586 * @count: the maximum number of bytes to read
@@ -863,6 +898,7 @@ EXPORT_SYMBOL(simple_statfs);
863EXPORT_SYMBOL(simple_sync_file); 898EXPORT_SYMBOL(simple_sync_file);
864EXPORT_SYMBOL(simple_unlink); 899EXPORT_SYMBOL(simple_unlink);
865EXPORT_SYMBOL(simple_read_from_buffer); 900EXPORT_SYMBOL(simple_read_from_buffer);
901EXPORT_SYMBOL(simple_write_to_buffer);
866EXPORT_SYMBOL(memory_read_from_buffer); 902EXPORT_SYMBOL(memory_read_from_buffer);
867EXPORT_SYMBOL(simple_transaction_set); 903EXPORT_SYMBOL(simple_transaction_set);
868EXPORT_SYMBOL(simple_transaction_get); 904EXPORT_SYMBOL(simple_transaction_get);
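
simple_read_from_buffer() already covered the read direction; the new helper is its mirror image, folding the bounds check, copy_from_user() and *ppos bookkeeping into one call. A minimal sketch of a write handler built on it; the buffer and its size are assumptions of the example, not part of this patch:

    static char msg[64];

    static ssize_t msg_write(struct file *file, const char __user *from,
                             size_t count, loff_t *ppos)
    {
            /* Consumes at most sizeof(msg) - *ppos bytes, advances
             * *ppos, and returns the bytes written, 0 at end of
             * buffer, or -EFAULT. */
            return simple_write_to_buffer(msg, sizeof(msg), ppos,
                                          from, count);
    }
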
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index fc9032dc8862..64fd427c993c 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/slab.h>
11#include <linux/time.h> 12#include <linux/time.h>
12#include <linux/nfs_fs.h> 13#include <linux/nfs_fs.h>
13#include <linux/sunrpc/clnt.h> 14#include <linux/sunrpc/clnt.h>
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index c81249fef11f..7932c399fab4 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/smp_lock.h> 10#include <linux/smp_lock.h>
11#include <linux/slab.h>
11#include <linux/types.h> 12#include <linux/types.h>
12#include <linux/errno.h> 13#include <linux/errno.h>
13#include <linux/fs.h> 14#include <linux/fs.h>
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index fefa4df3f005..e3015464fbab 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -10,6 +10,7 @@
10#include <linux/utsname.h> 10#include <linux/utsname.h>
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/ktime.h> 12#include <linux/ktime.h>
13#include <linux/slab.h>
13 14
14#include <linux/sunrpc/clnt.h> 15#include <linux/sunrpc/clnt.h>
15#include <linux/sunrpc/xprtsock.h> 16#include <linux/sunrpc/xprtsock.h>
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 7d150517ddf0..f1bacf1a0391 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -21,7 +21,6 @@
21#include <linux/errno.h> 21#include <linux/errno.h>
22#include <linux/in.h> 22#include <linux/in.h>
23#include <linux/uio.h> 23#include <linux/uio.h>
24#include <linux/slab.h>
25#include <linux/smp.h> 24#include <linux/smp.h>
26#include <linux/smp_lock.h> 25#include <linux/smp_lock.h>
27#include <linux/mutex.h> 26#include <linux/mutex.h>
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index a7966eed3c17..031c6569a134 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -9,7 +9,6 @@
9 9
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/slab.h>
13#include <linux/smp_lock.h> 12#include <linux/smp_lock.h>
14#include <linux/lockd/lockd.h> 13#include <linux/lockd/lockd.h>
15#include <linux/lockd/share.h> 14#include <linux/lockd/share.h>
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index d1001790fa9a..84055d31bfc5 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/types.h> 23#include <linux/types.h>
24#include <linux/slab.h>
24#include <linux/errno.h> 25#include <linux/errno.h>
25#include <linux/kernel.h> 26#include <linux/kernel.h>
26#include <linux/sched.h> 27#include <linux/sched.h>
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 56c9519d900a..0f2ab741ae7c 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -9,7 +9,6 @@
9 9
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/slab.h>
13#include <linux/smp_lock.h> 12#include <linux/smp_lock.h>
14#include <linux/lockd/lockd.h> 13#include <linux/lockd/lockd.h>
15#include <linux/lockd/share.h> 14#include <linux/lockd/share.h>
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index ad478da7ca63..d0ef94cfb3da 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -10,6 +10,7 @@
10#include <linux/string.h> 10#include <linux/string.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/in.h> 12#include <linux/in.h>
13#include <linux/slab.h>
13#include <linux/mutex.h> 14#include <linux/mutex.h>
14#include <linux/sunrpc/svc.h> 15#include <linux/sunrpc/svc.h>
15#include <linux/sunrpc/clnt.h> 16#include <linux/sunrpc/clnt.h>
diff --git a/fs/locks.c b/fs/locks.c
index ae9ded026b7c..ab24d49fc048 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1455,7 +1455,7 @@ EXPORT_SYMBOL(generic_setlease);
1455 * leases held by processes on this node. 1455 * leases held by processes on this node.
1456 * 1456 *
1457 * There is also no break_lease method; filesystems that 1457 * There is also no break_lease method; filesystems that
1458 * handle their own leases shoud break leases themselves from the 1458 * handle their own leases should break leases themselves from the
1459 * filesystem's open, create, and (on truncate) setattr methods. 1459 * filesystem's open, create, and (on truncate) setattr methods.
1460 * 1460 *
1461 * Warning: the only current setlease methods exist only to disable 1461 * Warning: the only current setlease methods exist only to disable
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 9718c22f186d..9bd2ce2a3040 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -9,6 +9,7 @@
9#include <linux/bio.h> 9#include <linux/bio.h>
10#include <linux/blkdev.h> 10#include <linux/blkdev.h>
11#include <linux/buffer_head.h> 11#include <linux/buffer_head.h>
12#include <linux/gfp.h>
12 13
13#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) 14#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
14 15
@@ -80,6 +81,7 @@ static void writeseg_end_io(struct bio *bio, int err)
80 prefetchw(&bvec->bv_page->flags); 81 prefetchw(&bvec->bv_page->flags);
81 82
82 end_page_writeback(page); 83 end_page_writeback(page);
84 page_cache_release(page);
83 } while (bvec >= bio->bi_io_vec); 85 } while (bvec >= bio->bi_io_vec);
84 bio_put(bio); 86 bio_put(bio);
85 if (atomic_dec_and_test(&super->s_pending_writes)) 87 if (atomic_dec_and_test(&super->s_pending_writes))
@@ -97,8 +99,10 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
97 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9); 99 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
98 int i; 100 int i;
99 101
102 if (max_pages > BIO_MAX_PAGES)
103 max_pages = BIO_MAX_PAGES;
100 bio = bio_alloc(GFP_NOFS, max_pages); 104 bio = bio_alloc(GFP_NOFS, max_pages);
101 BUG_ON(!bio); /* FIXME: handle this */ 105 BUG_ON(!bio);
102 106
103 for (i = 0; i < nr_pages; i++) { 107 for (i = 0; i < nr_pages; i++) {
104 if (i >= max_pages) { 108 if (i >= max_pages) {
@@ -191,8 +195,10 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
191 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9); 195 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
192 int i; 196 int i;
193 197
198 if (max_pages > BIO_MAX_PAGES)
199 max_pages = BIO_MAX_PAGES;
194 bio = bio_alloc(GFP_NOFS, max_pages); 200 bio = bio_alloc(GFP_NOFS, max_pages);
195 BUG_ON(!bio); /* FIXME: handle this */ 201 BUG_ON(!bio);
196 202
197 for (i = 0; i < nr_pages; i++) { 203 for (i = 0; i < nr_pages; i++) {
198 if (i >= max_pages) { 204 if (i >= max_pages) {
@@ -297,6 +303,11 @@ static void bdev_put_device(struct super_block *sb)
297 close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE); 303 close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE);
298} 304}
299 305
306static int bdev_can_write_buf(struct super_block *sb, u64 ofs)
307{
308 return 0;
309}
310
300static const struct logfs_device_ops bd_devops = { 311static const struct logfs_device_ops bd_devops = {
301 .find_first_sb = bdev_find_first_sb, 312 .find_first_sb = bdev_find_first_sb,
302 .find_last_sb = bdev_find_last_sb, 313 .find_last_sb = bdev_find_last_sb,
@@ -304,6 +315,7 @@ static const struct logfs_device_ops bd_devops = {
304 .readpage = bdev_readpage, 315 .readpage = bdev_readpage,
305 .writeseg = bdev_writeseg, 316 .writeseg = bdev_writeseg,
306 .erase = bdev_erase, 317 .erase = bdev_erase,
318 .can_write_buf = bdev_can_write_buf,
307 .sync = bdev_sync, 319 .sync = bdev_sync,
308 .put_device = bdev_put_device, 320 .put_device = bdev_put_device,
309}; 321};
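
Both __bdev_writeseg() and do_erase() size their bio from the queue's max hw sectors, which can exceed what bio_alloc() supports: it returns NULL when asked for more than BIO_MAX_PAGES vector entries, and the old FIXME'd BUG_ON would then fire. With the count clamped, a GFP_NOFS allocation from the bio mempool may block but should not fail, so the remaining BUG_ON is effectively an assertion. The same clamp, written with the kernel's min_t helper:

    max_pages = min_t(unsigned int, max_pages, BIO_MAX_PAGES);
    bio = bio_alloc(GFP_NOFS, max_pages);  /* mempool-backed; may
                                              block, won't overflow */
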
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index cafb6ef2e05b..a85d47d13e4b 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -9,6 +9,7 @@
9#include <linux/completion.h> 9#include <linux/completion.h>
10#include <linux/mount.h> 10#include <linux/mount.h>
11#include <linux/sched.h> 11#include <linux/sched.h>
12#include <linux/slab.h>
12 13
13#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) 14#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
14 15
@@ -126,7 +127,8 @@ static int mtd_readpage(void *_sb, struct page *page)
126 127
127 err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE, 128 err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
128 page_address(page)); 129 page_address(page));
129 if (err == -EUCLEAN) { 130 if (err == -EUCLEAN || err == -EBADMSG) {
131 /* -EBADMSG happens regularly on power failures */
130 err = 0; 132 err = 0;
131 /* FIXME: force GC this segment */ 133 /* FIXME: force GC this segment */
132 } 134 }
@@ -233,12 +235,32 @@ static void mtd_put_device(struct super_block *sb)
233 put_mtd_device(logfs_super(sb)->s_mtd); 235 put_mtd_device(logfs_super(sb)->s_mtd);
234} 236}
235 237
238static int mtd_can_write_buf(struct super_block *sb, u64 ofs)
239{
240 struct logfs_super *super = logfs_super(sb);
241 void *buf;
242 int err;
243
244 buf = kmalloc(super->s_writesize, GFP_KERNEL);
245 if (!buf)
246 return -ENOMEM;
247 err = mtd_read(sb, ofs, super->s_writesize, buf);
248 if (err)
249 goto out;
250 if (memchr_inv(buf, 0xff, super->s_writesize))
251 err = -EIO;
252 kfree(buf);
253out:
254 return err;
255}
256
236static const struct logfs_device_ops mtd_devops = { 257static const struct logfs_device_ops mtd_devops = {
237 .find_first_sb = mtd_find_first_sb, 258 .find_first_sb = mtd_find_first_sb,
238 .find_last_sb = mtd_find_last_sb, 259 .find_last_sb = mtd_find_last_sb,
239 .readpage = mtd_readpage, 260 .readpage = mtd_readpage,
240 .writeseg = mtd_writeseg, 261 .writeseg = mtd_writeseg,
241 .erase = mtd_erase, 262 .erase = mtd_erase,
263 .can_write_buf = mtd_can_write_buf,
242 .sync = mtd_sync, 264 .sync = mtd_sync,
243 .put_device = mtd_put_device, 265 .put_device = mtd_put_device,
244}; 266};
@@ -250,5 +272,7 @@ int logfs_get_sb_mtd(struct file_system_type *type, int flags,
250 const struct logfs_device_ops *devops = &mtd_devops; 272 const struct logfs_device_ops *devops = &mtd_devops;
251 273
252 mtd = get_mtd_device(NULL, mtdnr); 274 mtd = get_mtd_device(NULL, mtdnr);
275 if (IS_ERR(mtd))
276 return PTR_ERR(mtd);
253 return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt); 277 return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt);
254} 278}
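
mtd_can_write_buf() decides whether the pending write buffer can still be flushed at ofs: it reads back one write-size unit and requires every byte to read as 0xff, because erased flash reads all-ones and cells can only be programmed (bits cleared) between erases. Condensed to the key test:

    /* memchr_inv(p, c, n) returns the first byte of p differing
     * from c, or NULL if all n bytes match. */
    err = mtd_read(sb, ofs, super->s_writesize, buf);
    if (!err && memchr_inv(buf, 0xff, super->s_writesize))
            err = -EIO;   /* no longer erased: wbuf cannot be flushed */
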
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 56a8bfbb0120..72d1893ddd36 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -6,13 +6,13 @@
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> 6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */ 7 */
8#include "logfs.h" 8#include "logfs.h"
9 9#include <linux/slab.h>
10 10
11/* 11/*
12 * Atomic dir operations 12 * Atomic dir operations
13 * 13 *
14 * Directory operations are by default not atomic. Dentries and Inodes are 14 * Directory operations are by default not atomic. Dentries and Inodes are
15 * created/removed/altered in seperate operations. Therefore we need to do 15 * created/removed/altered in separate operations. Therefore we need to do
16 * a small amount of journaling. 16 * a small amount of journaling.
17 * 17 *
18 * Create, link, mkdir, mknod and symlink all share the same function to do 18 * Create, link, mkdir, mknod and symlink all share the same function to do
@@ -303,12 +303,12 @@ static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
303 (filler_t *)logfs_readpage, NULL); 303 (filler_t *)logfs_readpage, NULL);
304 if (IS_ERR(page)) 304 if (IS_ERR(page))
305 return PTR_ERR(page); 305 return PTR_ERR(page);
306 dd = kmap_atomic(page, KM_USER0); 306 dd = kmap(page);
307 BUG_ON(dd->namelen == 0); 307 BUG_ON(dd->namelen == 0);
308 308
309 full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen), 309 full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen),
310 pos, be64_to_cpu(dd->ino), dd->type); 310 pos, be64_to_cpu(dd->ino), dd->type);
311 kunmap_atomic(dd, KM_USER0); 311 kunmap(page);
312 page_cache_release(page); 312 page_cache_release(page);
313 if (full) 313 if (full)
314 break; 314 break;
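
The readdir switch from kmap_atomic()/KM_USER0 to plain kmap() is a sleeping-context fix: filldir() copies the name out to a user buffer and can fault, hence sleep, which is forbidden while an atomic kmap is held. kmap() keeps the mapping valid across a sleep, at the cost of a global lock. The corrected shape, with the hazard annotated:

    dd = kmap(page);            /* may sleep; mapping persists       */
    full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen),
                   pos, be64_to_cpu(dd->ino), dd->type); /* may fault */
    kunmap(page);               /* note: takes the page, not the ptr */
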
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index 370f367a933e..0de524071870 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -161,7 +161,17 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc)
161 161
162static void logfs_invalidatepage(struct page *page, unsigned long offset) 162static void logfs_invalidatepage(struct page *page, unsigned long offset)
163{ 163{
164 move_page_to_btree(page); 164 struct logfs_block *block = logfs_block(page);
165
166 if (block->reserved_bytes) {
167 struct super_block *sb = page->mapping->host->i_sb;
168 struct logfs_super *super = logfs_super(sb);
169
170 super->s_dirty_pages -= block->reserved_bytes;
171 block->ops->free_block(sb, block);
172 BUG_ON(bitmap_weight(block->alias_map, LOGFS_BLOCK_FACTOR));
173 } else
174 move_page_to_btree(page);
165 BUG_ON(PagePrivate(page) || page->private); 175 BUG_ON(PagePrivate(page) || page->private);
166} 176}
167 177
@@ -212,10 +222,8 @@ int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
212int logfs_fsync(struct file *file, struct dentry *dentry, int datasync) 222int logfs_fsync(struct file *file, struct dentry *dentry, int datasync)
213{ 223{
214 struct super_block *sb = dentry->d_inode->i_sb; 224 struct super_block *sb = dentry->d_inode->i_sb;
215 struct logfs_super *super = logfs_super(sb);
216 225
217 /* FIXME: write anchor */ 226 logfs_write_anchor(sb);
218 super->s_devops->sync(sb);
219 return 0; 227 return 0;
220} 228}
221 229
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
index 92949f95a901..caa4419285dc 100644
--- a/fs/logfs/gc.c
+++ b/fs/logfs/gc.c
@@ -7,6 +7,7 @@
7 */ 7 */
8#include "logfs.h" 8#include "logfs.h"
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/slab.h>
10 11
11/* 12/*
12 * Wear leveling needs to kick in when the difference between low erase 13 * Wear leveling needs to kick in when the difference between low erase
@@ -121,7 +122,7 @@ static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
121 logfs_safe_iput(inode, cookie); 122 logfs_safe_iput(inode, cookie);
122} 123}
123 124
124static u32 logfs_gc_segment(struct super_block *sb, u32 segno, u8 dist) 125static u32 logfs_gc_segment(struct super_block *sb, u32 segno)
125{ 126{
126 struct logfs_super *super = logfs_super(sb); 127 struct logfs_super *super = logfs_super(sb);
127 struct logfs_segment_header sh; 128 struct logfs_segment_header sh;
@@ -400,7 +401,7 @@ static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
400 segno, (u64)segno << super->s_segshift, 401 segno, (u64)segno << super->s_segshift,
401 dist, no_free_segments(sb), valid, 402 dist, no_free_segments(sb), valid,
402 super->s_free_bytes); 403 super->s_free_bytes);
403 cleaned = logfs_gc_segment(sb, segno, dist); 404 cleaned = logfs_gc_segment(sb, segno);
404 log_gc("GC segment #%02x complete - now %x valid\n", segno, 405 log_gc("GC segment #%02x complete - now %x valid\n", segno,
405 valid - cleaned); 406 valid - cleaned);
406 BUG_ON(cleaned != valid); 407 BUG_ON(cleaned != valid);
@@ -458,6 +459,14 @@ static void __logfs_gc_pass(struct super_block *sb, int target)
458 struct logfs_block *block; 459 struct logfs_block *block;
459 int round, progress, last_progress = 0; 460 int round, progress, last_progress = 0;
460 461
462 /*
463 * Doing too many changes to the segfile at once would result
464 * in a large number of aliases. Write the journal before
465 * things get out of hand.
466 */
467 if (super->s_shadow_tree.no_shadowed_segments >= MAX_OBJ_ALIASES)
468 logfs_write_anchor(sb);
469
461 if (no_free_segments(sb) >= target && 470 if (no_free_segments(sb) >= target &&
462 super->s_no_object_aliases < MAX_OBJ_ALIASES) 471 super->s_no_object_aliases < MAX_OBJ_ALIASES)
463 return; 472 return;
@@ -623,38 +632,31 @@ static int check_area(struct super_block *sb, int i)
623{ 632{
624 struct logfs_super *super = logfs_super(sb); 633 struct logfs_super *super = logfs_super(sb);
625 struct logfs_area *area = super->s_area[i]; 634 struct logfs_area *area = super->s_area[i];
626 struct logfs_object_header oh; 635 gc_level_t gc_level;
636 u32 cleaned, valid, ec;
627 u32 segno = area->a_segno; 637 u32 segno = area->a_segno;
628 u32 ofs = area->a_used_bytes; 638 u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
629 __be32 crc;
630 int err;
631 639
632 if (!area->a_is_open) 640 if (!area->a_is_open)
633 return 0; 641 return 0;
634 642
635 for (ofs = area->a_used_bytes; 643 if (super->s_devops->can_write_buf(sb, ofs) == 0)
636 ofs <= super->s_segsize - sizeof(oh); 644 return 0;
637 ofs += (u32)be16_to_cpu(oh.len) + sizeof(oh)) {
638 err = wbuf_read(sb, dev_ofs(sb, segno, ofs), sizeof(oh), &oh);
639 if (err)
640 return err;
641
642 if (!memchr_inv(&oh, 0xff, sizeof(oh)))
643 break;
644 645
645 crc = logfs_crc32(&oh, sizeof(oh) - 4, 4); 646 printk(KERN_INFO"LogFS: Possibly incomplete write at %llx\n", ofs);
646 if (crc != oh.crc) { 647 /*
647 printk(KERN_INFO "interrupted header at %llx\n", 648 * The device cannot write back the write buffer. Most likely the
648 dev_ofs(sb, segno, ofs)); 649 * wbuf was already written out and the system crashed at some point
649 return 0; 650 * before the journal commit happened. In that case we wouldn't have
650 } 651 * to do anything. But if the crash happened before the wbuf was
651 } 652 * written out correctly, we must GC this segment. So assume the
652 if (ofs != area->a_used_bytes) { 653 * worst and always do the GC run.
653 printk(KERN_INFO "%x bytes unaccounted data found at %llx\n", 654 */
654 ofs - area->a_used_bytes, 655 area->a_is_open = 0;
655 dev_ofs(sb, segno, area->a_used_bytes)); 656 valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
656 area->a_used_bytes = ofs; 657 cleaned = logfs_gc_segment(sb, segno);
657 } 658 if (cleaned != valid)
659 return -EIO;
658 return 0; 660 return 0;
659} 661}
660 662
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 33ec1aeaeec4..755a92e8daa7 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -6,6 +6,7 @@
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> 6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */ 7 */
8#include "logfs.h" 8#include "logfs.h"
9#include <linux/slab.h>
9#include <linux/writeback.h> 10#include <linux/writeback.h>
10#include <linux/backing-dev.h> 11#include <linux/backing-dev.h>
11 12
@@ -192,6 +193,7 @@ static void logfs_init_inode(struct super_block *sb, struct inode *inode)
192 inode->i_ctime = CURRENT_TIME; 193 inode->i_ctime = CURRENT_TIME;
193 inode->i_mtime = CURRENT_TIME; 194 inode->i_mtime = CURRENT_TIME;
194 inode->i_nlink = 1; 195 inode->i_nlink = 1;
196 li->li_refcount = 1;
195 INIT_LIST_HEAD(&li->li_freeing_list); 197 INIT_LIST_HEAD(&li->li_freeing_list);
196 198
197 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) 199 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
@@ -325,7 +327,7 @@ static void logfs_set_ino_generation(struct super_block *sb,
325 u64 ino; 327 u64 ino;
326 328
327 mutex_lock(&super->s_journal_mutex); 329 mutex_lock(&super->s_journal_mutex);
328 ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino); 330 ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino + 1);
329 super->s_last_ino = ino; 331 super->s_last_ino = ino;
330 super->s_inos_till_wrap--; 332 super->s_inos_till_wrap--;
331 if (super->s_inos_till_wrap < 0) { 333 if (super->s_inos_till_wrap < 0) {
@@ -385,8 +387,7 @@ static void logfs_init_once(void *_li)
385 387
386static int logfs_sync_fs(struct super_block *sb, int wait) 388static int logfs_sync_fs(struct super_block *sb, int wait)
387{ 389{
388 /* FIXME: write anchor */ 390 logfs_write_anchor(sb);
389 logfs_super(sb)->s_devops->sync(sb);
390 return 0; 391 return 0;
391} 392}
392 393
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index 6ad30a4c9052..4b0e0616b357 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -6,6 +6,7 @@
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> 6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */ 7 */
8#include "logfs.h" 8#include "logfs.h"
9#include <linux/slab.h>
9 10
10static void logfs_calc_free(struct super_block *sb) 11static void logfs_calc_free(struct super_block *sb)
11{ 12{
@@ -131,10 +132,9 @@ static int read_area(struct super_block *sb, struct logfs_je_area *a)
131 132
132 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); 133 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
133 if (super->s_writesize > 1) 134 if (super->s_writesize > 1)
134 logfs_buf_recover(area, ofs, a + 1, super->s_writesize); 135 return logfs_buf_recover(area, ofs, a + 1, super->s_writesize);
135 else 136 else
136 logfs_buf_recover(area, ofs, NULL, 0); 137 return logfs_buf_recover(area, ofs, NULL, 0);
137 return 0;
138} 138}
139 139
140static void *unpack(void *from, void *to) 140static void *unpack(void *from, void *to)
@@ -244,7 +244,7 @@ static int read_je(struct super_block *sb, u64 ofs)
244 read_erasecount(sb, unpack(jh, scratch)); 244 read_erasecount(sb, unpack(jh, scratch));
245 break; 245 break;
246 case JE_AREA: 246 case JE_AREA:
247 read_area(sb, unpack(jh, scratch)); 247 err = read_area(sb, unpack(jh, scratch));
248 break; 248 break;
249 case JE_OBJ_ALIAS: 249 case JE_OBJ_ALIAS:
250 err = logfs_load_object_aliases(sb, unpack(jh, scratch), 250 err = logfs_load_object_aliases(sb, unpack(jh, scratch),
@@ -388,7 +388,10 @@ static void journal_get_erase_count(struct logfs_area *area)
388static int journal_erase_segment(struct logfs_area *area) 388static int journal_erase_segment(struct logfs_area *area)
389{ 389{
390 struct super_block *sb = area->a_sb; 390 struct super_block *sb = area->a_sb;
391 struct logfs_segment_header sh; 391 union {
392 struct logfs_segment_header sh;
393 unsigned char c[ALIGN(sizeof(struct logfs_segment_header), 16)];
394 } u;
392 u64 ofs; 395 u64 ofs;
393 int err; 396 int err;
394 397
@@ -396,20 +399,21 @@ static int journal_erase_segment(struct logfs_area *area)
396 if (err) 399 if (err)
397 return err; 400 return err;
398 401
399 sh.pad = 0; 402 memset(&u, 0, sizeof(u));
400 sh.type = SEG_JOURNAL; 403 u.sh.pad = 0;
401 sh.level = 0; 404 u.sh.type = SEG_JOURNAL;
402 sh.segno = cpu_to_be32(area->a_segno); 405 u.sh.level = 0;
403 sh.ec = cpu_to_be32(area->a_erase_count); 406 u.sh.segno = cpu_to_be32(area->a_segno);
404 sh.gec = cpu_to_be64(logfs_super(sb)->s_gec); 407 u.sh.ec = cpu_to_be32(area->a_erase_count);
405 sh.crc = logfs_crc32(&sh, sizeof(sh), 4); 408 u.sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
409 u.sh.crc = logfs_crc32(&u.sh, sizeof(u.sh), 4);
406 410
407 /* This causes a bug in segment.c. Not yet. */ 411 /* This causes a bug in segment.c. Not yet. */
408 //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0); 412 //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0);
409 413
410 ofs = dev_ofs(sb, area->a_segno, 0); 414 ofs = dev_ofs(sb, area->a_segno, 0);
411 area->a_used_bytes = ALIGN(sizeof(sh), 16); 415 area->a_used_bytes = sizeof(u);
412 logfs_buf_write(area, ofs, &sh, sizeof(sh)); 416 logfs_buf_write(area, ofs, &u, sizeof(u));
413 return 0; 417 return 0;
414} 418}
415 419
@@ -493,6 +497,8 @@ static void account_shadows(struct super_block *sb)
493 497
494 btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow); 498 btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow);
495 btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow); 499 btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow);
500 btree_grim_visitor32(&tree->segment_map, 0, NULL);
501 tree->no_shadowed_segments = 0;
496 502
497 if (li->li_block) { 503 if (li->li_block) {
498 /* 504 /*
@@ -606,9 +612,9 @@ static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type,
606 if (len == 0) 612 if (len == 0)
607 return logfs_write_header(super, header, 0, type); 613 return logfs_write_header(super, header, 0, type);
608 614
615 BUG_ON(len > sb->s_blocksize);
609 compr_len = logfs_compress(buf, data, len, sb->s_blocksize); 616 compr_len = logfs_compress(buf, data, len, sb->s_blocksize);
610 if (compr_len < 0 || type == JE_ANCHOR) { 617 if (compr_len < 0 || type == JE_ANCHOR) {
611 BUG_ON(len > sb->s_blocksize);
612 memcpy(data, buf, len); 618 memcpy(data, buf, len);
613 compr_len = len; 619 compr_len = len;
614 compr = COMPR_NONE; 620 compr = COMPR_NONE;
@@ -660,6 +666,7 @@ static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type,
660 if (ofs < 0) 666 if (ofs < 0)
661 return ofs; 667 return ofs;
662 logfs_buf_write(area, ofs, super->s_compressed_je, len); 668 logfs_buf_write(area, ofs, super->s_compressed_je, len);
669 BUG_ON(super->s_no_je >= MAX_JOURNAL_ENTRIES);
663 super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs); 670 super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs);
664 return 0; 671 return 0;
665} 672}
@@ -800,6 +807,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
800{ 807{
801 struct logfs_super *super = logfs_super(sb); 808 struct logfs_super *super = logfs_super(sb);
802 struct logfs_area *area = super->s_journal_area; 809 struct logfs_area *area = super->s_journal_area;
810 struct btree_head32 *head = &super->s_reserved_segments;
803 u32 segno, ec; 811 u32 segno, ec;
804 int i, err; 812 int i, err;
805 813
@@ -807,6 +815,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
807 /* Drop old segments */ 815 /* Drop old segments */
808 journal_for_each(i) 816 journal_for_each(i)
809 if (super->s_journal_seg[i]) { 817 if (super->s_journal_seg[i]) {
818 btree_remove32(head, super->s_journal_seg[i]);
810 logfs_set_segment_unreserved(sb, 819 logfs_set_segment_unreserved(sb,
811 super->s_journal_seg[i], 820 super->s_journal_seg[i],
812 super->s_journal_ec[i]); 821 super->s_journal_ec[i]);
@@ -819,8 +828,13 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
819 super->s_journal_seg[i] = segno; 828 super->s_journal_seg[i] = segno;
820 super->s_journal_ec[i] = ec; 829 super->s_journal_ec[i] = ec;
821 logfs_set_segment_reserved(sb, segno); 830 logfs_set_segment_reserved(sb, segno);
831 err = btree_insert32(head, segno, (void *)1, GFP_KERNEL);
832 BUG_ON(err); /* mempool should prevent this */
833 err = logfs_erase_segment(sb, segno, 1);
834 BUG_ON(err); /* FIXME: remount-ro would be nicer */
822 } 835 }
823 /* Manually move journal_area */ 836 /* Manually move journal_area */
837 freeseg(sb, area->a_segno);
824 area->a_segno = super->s_journal_seg[0]; 838 area->a_segno = super->s_journal_seg[0];
825 area->a_is_open = 0; 839 area->a_is_open = 0;
826 area->a_used_bytes = 0; 840 area->a_used_bytes = 0;
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 129779431373..1a9db84f8d8f 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -144,6 +144,7 @@ struct logfs_area_ops {
  * @erase: erase one segment
  * @read: read from the device
  * @erase: erase part of the device
+ * @can_write_buf: decide whether wbuf can be written to ofs
  */
 struct logfs_device_ops {
 	struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs);
@@ -153,6 +154,7 @@ struct logfs_device_ops {
 	void (*writeseg)(struct super_block *sb, u64 ofs, size_t len);
 	int (*erase)(struct super_block *sb, loff_t ofs, size_t len,
 			int ensure_write);
+	int (*can_write_buf)(struct super_block *sb, u64 ofs);
 	void (*sync)(struct super_block *sb);
 	void (*put_device)(struct super_block *sb);
 };
@@ -257,10 +259,14 @@ struct logfs_shadow {
  * struct shadow_tree
  * @new: shadows where old_ofs==0, indexed by new_ofs
  * @old: shadows where old_ofs!=0, indexed by old_ofs
+ * @segment_map: bitfield of segments containing shadows
+ * @no_shadowed_segment: number of segments containing shadows
  */
 struct shadow_tree {
 	struct btree_head64 new;
 	struct btree_head64 old;
+	struct btree_head32 segment_map;
+	int no_shadowed_segments;
 };

 struct object_alias_item {
@@ -305,13 +311,14 @@ typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix,
 		level_t level, int child_no, __be64 val);
 struct logfs_block_ops {
 	void (*write_block)(struct logfs_block *block);
-	gc_level_t (*block_level)(struct logfs_block *block);
 	void (*free_block)(struct super_block *sb, struct logfs_block*block);
 	int (*write_alias)(struct super_block *sb,
 			struct logfs_block *block,
 			write_alias_t *write_one_alias);
 };

+#define MAX_JOURNAL_ENTRIES 256
+
 struct logfs_super {
 	struct mtd_info *s_mtd; /* underlying device */
 	struct block_device *s_bdev; /* underlying device */
@@ -378,7 +385,7 @@ struct logfs_super {
 	u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */
 	u64 s_last_version;
 	struct logfs_area *s_journal_area; /* open journal segment */
-	__be64 s_je_array[64];
+	__be64 s_je_array[MAX_JOURNAL_ENTRIES];
 	int s_no_je;

 	int s_sum_index; /* for the 12 summaries */
@@ -389,6 +396,7 @@ struct logfs_super {
 	int s_lock_count;
 	mempool_t *s_block_pool; /* struct logfs_block pool */
 	mempool_t *s_shadow_pool; /* struct logfs_shadow pool */
+	struct list_head s_writeback_list; /* writeback pages */
 	/*
 	 * Space accounting:
 	 * - s_used_bytes specifies space used to store valid data objects.
@@ -587,24 +595,25 @@ void move_page_to_btree(struct page *page);
 int logfs_init_mapping(struct super_block *sb);
 void logfs_sync_area(struct logfs_area *area);
 void logfs_sync_segments(struct super_block *sb);
+void freeseg(struct super_block *sb, u32 segno);

 /* area handling */
 int logfs_init_areas(struct super_block *sb);
 void logfs_cleanup_areas(struct super_block *sb);
 int logfs_open_area(struct logfs_area *area, size_t bytes);
-void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
+int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
 		int use_filler);

-static inline void logfs_buf_write(struct logfs_area *area, u64 ofs,
+static inline int logfs_buf_write(struct logfs_area *area, u64 ofs,
 		void *buf, size_t len)
 {
-	__logfs_buf_write(area, ofs, buf, len, 0);
+	return __logfs_buf_write(area, ofs, buf, len, 0);
 }

-static inline void logfs_buf_recover(struct logfs_area *area, u64 ofs,
+static inline int logfs_buf_recover(struct logfs_area *area, u64 ofs,
 		void *buf, size_t len)
 {
-	__logfs_buf_write(area, ofs, buf, len, 1);
+	return __logfs_buf_write(area, ofs, buf, len, 1);
 }

 /* super.c */
@@ -698,7 +707,7 @@ static inline gc_level_t expand_level(u64 ino, level_t __level)
 	u8 level = (__force u8)__level;

 	if (ino == LOGFS_INO_MASTER) {
-		/* ifile has seperate areas */
+		/* ifile has separate areas */
 		level += LOGFS_MAX_LEVELS;
 	}
 	return (__force gc_level_t)level;
@@ -721,4 +730,10 @@ static inline struct logfs_area *get_area(struct super_block *sb,
 	return logfs_super(sb)->s_area[(__force u8)gc_level];
 }

+static inline void logfs_mempool_destroy(mempool_t *pool)
+{
+	if (pool)
+		mempool_destroy(pool);
+}
+
 #endif
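The logfs_mempool_destroy() wrapper added above exists because several logfs unwind paths can run before every mempool has been created, and mempool_destroy() does not tolerate a NULL pool. A minimal sketch of the intended call pattern (example_init() is a hypothetical caller, not part of the patch):

static int example_init(struct logfs_super *super)
{
	/* Either allocation may fail and leave the other pointer NULL. */
	super->s_block_pool = mempool_create_kmalloc_pool(16,
			sizeof(struct logfs_block));
	super->s_shadow_pool = mempool_create_kmalloc_pool(16,
			sizeof(struct logfs_shadow));
	if (!super->s_block_pool || !super->s_shadow_pool) {
		/* NULL-safe: no need to test each pool before destroying. */
		logfs_mempool_destroy(super->s_block_pool);
		logfs_mempool_destroy(super->s_shadow_pool);
		return -ENOMEM;
	}
	return 0;
}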
diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h
index f674725663fe..ae960519c54a 100644
--- a/fs/logfs/logfs_abi.h
+++ b/fs/logfs/logfs_abi.h
@@ -50,9 +50,9 @@ static inline void check_##type(void) \
  * 12 - gc recycled blocks, long-lived data
  * 13 - replacement blocks, short-lived data
  *
- * Levels 1-11 are necessary for robust gc operations and help seperate
+ * Levels 1-11 are necessary for robust gc operations and help separate
  * short-lived metadata from longer-lived file data. In the future,
- * file data should get seperated into several segments based on simple
+ * file data should get separated into several segments based on simple
  * heuristics. Old data recycled during gc operation is expected to be
  * long-lived. New data is of uncertain life expectancy. New data
  * used to replace older blocks in existing files is expected to be
@@ -117,7 +117,7 @@ static inline void check_##type(void) \
 #define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED)

 /*
- * LogFS needs to seperate data into levels. Each level is defined as the
+ * LogFS needs to separate data into levels. Each level is defined as the
  * maximal possible distance from the master inode (inode of the inode file).
  * Data blocks reside on level 0, 1x indirect block on level 1, etc.
  * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11.
@@ -204,7 +204,7 @@ SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE);
  * @ds_crc: crc32 of structure starting with the next field
  * @ds_ifile_levels: maximum number of levels for ifile
  * @ds_iblock_levels: maximum number of levels for regular files
- * @ds_data_levels: number of seperate levels for data
+ * @ds_data_levels: number of separate levels for data
  * @pad0: reserved, must be 0
  * @ds_feature_incompat: incompatible filesystem features
  * @ds_feature_ro_compat: read-only compatible filesystem features
@@ -456,7 +456,7 @@ enum logfs_vim {
  * @vim: life expectancy of data
  *
  * "Areas" are segments currently being used for writing. There is at least
- * one area per GC level. Several may be used to seperate long-living from
+ * one area per GC level. Several may be used to separate long-living from
  * short-living data. If an area with unknown vim is encountered, it can
  * simply be closed.
  * The write buffer immediately follow this header.
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 7a23b3e7c0a7..0718d112a1a5 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -18,6 +18,7 @@
  */
 #include "logfs.h"
 #include <linux/sched.h>
+#include <linux/slab.h>

 static u64 adjust_bix(u64 bix, level_t level)
 {
@@ -429,25 +430,6 @@ static void inode_write_block(struct logfs_block *block)
 	}
 }

-static gc_level_t inode_block_level(struct logfs_block *block)
-{
-	BUG_ON(block->inode->i_ino == LOGFS_INO_MASTER);
-	return GC_LEVEL(LOGFS_MAX_LEVELS);
-}
-
-static gc_level_t indirect_block_level(struct logfs_block *block)
-{
-	struct page *page;
-	struct inode *inode;
-	u64 bix;
-	level_t level;
-
-	page = block->page;
-	inode = page->mapping->host;
-	logfs_unpack_index(page->index, &bix, &level);
-	return expand_level(inode->i_ino, level);
-}
-
 /*
  * This silences a false, yet annoying gcc warning. I hate it when my editor
  * jumps into bitops.h each time I recompile this file.
@@ -586,14 +568,12 @@ static void indirect_free_block(struct super_block *sb,

 static struct logfs_block_ops inode_block_ops = {
 	.write_block = inode_write_block,
-	.block_level = inode_block_level,
 	.free_block = inode_free_block,
 	.write_alias = inode_write_alias,
 };

 struct logfs_block_ops indirect_block_ops = {
 	.write_block = indirect_write_block,
-	.block_level = indirect_block_level,
 	.free_block = indirect_free_block,
 	.write_alias = indirect_write_alias,
 };
@@ -912,6 +892,8 @@ u64 logfs_seek_hole(struct inode *inode, u64 bix)
 		return bix;
 	else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED)
 		bix = maxbix(li->li_height);
+	else if (bix >= maxbix(li->li_height))
+		return bix;
 	else {
 		bix = seek_holedata_loop(inode, bix, 0);
 		if (bix < maxbix(li->li_height))
@@ -1113,17 +1095,25 @@ static int logfs_reserve_bytes(struct inode *inode, int bytes)
 int get_page_reserve(struct inode *inode, struct page *page)
 {
 	struct logfs_super *super = logfs_super(inode->i_sb);
+	struct logfs_block *block = logfs_block(page);
 	int ret;

-	if (logfs_block(page) && logfs_block(page)->reserved_bytes)
+	if (block && block->reserved_bytes)
 		return 0;

 	logfs_get_wblocks(inode->i_sb, page, WF_LOCK);
-	ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE);
+	while ((ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE)) &&
+			!list_empty(&super->s_writeback_list)) {
+		block = list_entry(super->s_writeback_list.next,
+				struct logfs_block, alias_list);
+		block->ops->write_block(block);
+	}
 	if (!ret) {
 		alloc_data_block(inode, page);
-		logfs_block(page)->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE;
+		block = logfs_block(page);
+		block->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE;
 		super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE;
+		list_move_tail(&block->alias_list, &super->s_writeback_list);
 	}
 	logfs_put_wblocks(inode->i_sb, page, WF_LOCK);
 	return ret;
@@ -1240,6 +1230,18 @@ static void free_shadow(struct inode *inode, struct logfs_shadow *shadow)
 	mempool_free(shadow, super->s_shadow_pool);
 }

+static void mark_segment(struct shadow_tree *tree, u32 segno)
+{
+	int err;
+
+	if (!btree_lookup32(&tree->segment_map, segno)) {
+		err = btree_insert32(&tree->segment_map, segno, (void *)1,
+				GFP_NOFS);
+		BUG_ON(err);
+		tree->no_shadowed_segments++;
+	}
+}
+
 /**
  * fill_shadow_tree - Propagate shadow tree changes due to a write
  * @inode: Inode owning the page
@@ -1287,6 +1289,8 @@ static void fill_shadow_tree(struct inode *inode, struct page *page,

 		super->s_dirty_used_bytes += shadow->new_len;
 		super->s_dirty_free_bytes += shadow->old_len;
+		mark_segment(tree, shadow->old_ofs >> super->s_segshift);
+		mark_segment(tree, shadow->new_ofs >> super->s_segshift);
 	}
 }

@@ -1594,7 +1598,6 @@ int logfs_delete(struct inode *inode, pgoff_t index,
 	return ret;
 }

-/* Rewrite cannot mark the inode dirty but has to write it immediatly. */
 int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
 		gc_level_t gc_level, long flags)
 {
@@ -1611,6 +1614,18 @@ int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
 		if (level != 0)
 			alloc_indirect_block(inode, page, 0);
 		err = logfs_write_buf(inode, page, flags);
+		if (!err && shrink_level(gc_level) == 0) {
+			/* Rewrite cannot mark the inode dirty but has to
+			 * write it immediatly.
+			 * Q: Can't we just create an alias for the inode
+			 * instead? And if not, why not?
+			 */
+			if (inode->i_ino == LOGFS_INO_MASTER)
+				logfs_write_anchor(inode->i_sb);
+			else {
+				err = __logfs_write_inode(inode, flags);
+			}
+		}
 	}
 	logfs_put_write_page(page);
 	return err;
@@ -1833,19 +1848,37 @@ static int __logfs_truncate(struct inode *inode, u64 size)
 		return logfs_truncate_direct(inode, size);
 }

-int logfs_truncate(struct inode *inode, u64 size)
+/*
+ * Truncate, by changing the segment file, can consume a fair amount
+ * of resources. So back off from time to time and do some GC.
+ * 8 or 2048 blocks should be well within safety limits even if
+ * every single block resided in a different segment.
+ */
+#define TRUNCATE_STEP (8 * 1024 * 1024)
+int logfs_truncate(struct inode *inode, u64 target)
 {
 	struct super_block *sb = inode->i_sb;
-	int err;
+	u64 size = i_size_read(inode);
+	int err = 0;

-	logfs_get_wblocks(sb, NULL, 1);
-	err = __logfs_truncate(inode, size);
-	if (!err)
-		err = __logfs_write_inode(inode, 0);
-	logfs_put_wblocks(sb, NULL, 1);
+	size = ALIGN(size, TRUNCATE_STEP);
+	while (size > target) {
+		if (size > TRUNCATE_STEP)
+			size -= TRUNCATE_STEP;
+		else
+			size = 0;
+		if (size < target)
+			size = target;
+
+		logfs_get_wblocks(sb, NULL, 1);
+		err = __logfs_truncate(inode, size);
+		if (!err)
+			err = __logfs_write_inode(inode, 0);
+		logfs_put_wblocks(sb, NULL, 1);
+	}

 	if (!err)
-		err = vmtruncate(inode, size);
+		err = vmtruncate(inode, target);

 	/* I don't trust error recovery yet. */
 	WARN_ON(err);
@@ -2226,6 +2259,7 @@ int logfs_init_rw(struct super_block *sb)
 	int min_fill = 3 * super->s_no_blocks;

 	INIT_LIST_HEAD(&super->s_object_alias);
+	INIT_LIST_HEAD(&super->s_writeback_list);
 	mutex_init(&super->s_write_mutex);
 	super->s_block_pool = mempool_create_kmalloc_pool(min_fill,
 			sizeof(struct logfs_block));
@@ -2239,8 +2273,6 @@ void logfs_cleanup_rw(struct super_block *sb)
 	struct logfs_super *super = logfs_super(sb);

 	destroy_meta_inode(super->s_segfile_inode);
-	if (super->s_block_pool)
-		mempool_destroy(super->s_block_pool);
-	if (super->s_shadow_pool)
-		mempool_destroy(super->s_shadow_pool);
+	logfs_mempool_destroy(super->s_block_pool);
+	logfs_mempool_destroy(super->s_shadow_pool);
 }
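The reworked logfs_truncate() above walks the file size down in TRUNCATE_STEP chunks, releasing and re-acquiring the write locks between steps so garbage collection can run in the gaps. The stepping arithmetic in isolation (step_down() is a hypothetical name used only for illustration):

#define TRUNCATE_STEP (8 * 1024 * 1024)

/* One loop iteration: come down by at most one step, clamped so the
 * size never overshoots the requested target. */
static u64 step_down(u64 size, u64 target)
{
	size = (size > TRUNCATE_STEP) ? size - TRUNCATE_STEP : 0;
	return (size < target) ? target : size;
}

Starting from ALIGN(i_size, TRUNCATE_STEP), repeated calls reach target in at most size/TRUNCATE_STEP iterations, each bracketed by logfs_get_wblocks()/logfs_put_wblocks() as in the hunk above.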
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 1a14f9910d55..a9657afb70ad 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -10,6 +10,7 @@
  * three kinds of objects: inodes, dentries and blocks, both data and indirect.
  */
 #include "logfs.h"
+#include <linux/slab.h>

 static int logfs_mark_segment_bad(struct super_block *sb, u32 segno)
 {
@@ -66,7 +67,7 @@ static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
 	return page;
 }

-void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
+int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
 		int use_filler)
 {
 	pgoff_t index = ofs >> PAGE_SHIFT;
@@ -80,8 +81,10 @@ void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
 		copylen = min((ulong)len, PAGE_SIZE - offset);

 		page = get_mapping_page(area->a_sb, index, use_filler);
-		SetPageUptodate(page);
+		if (IS_ERR(page))
+			return PTR_ERR(page);
 		BUG_ON(!page); /* FIXME: reserve a pool */
+		SetPageUptodate(page);
 		memcpy(page_address(page) + offset, buf, copylen);
 		SetPagePrivate(page);
 		page_cache_release(page);
@@ -91,52 +94,61 @@ void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
 		offset = 0;
 		index++;
 	} while (len);
+	return 0;
 }

-/*
- * bdev_writeseg will write full pages. Memset the tail to prevent data leaks.
- */
-static void pad_wbuf(struct logfs_area *area, int final)
+static void pad_partial_page(struct logfs_area *area)
 {
 	struct super_block *sb = area->a_sb;
-	struct logfs_super *super = logfs_super(sb);
 	struct page *page;
 	u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
 	pgoff_t index = ofs >> PAGE_SHIFT;
 	long offset = ofs & (PAGE_SIZE-1);
 	u32 len = PAGE_SIZE - offset;

-	if (len == PAGE_SIZE) {
-		/* The math in this function can surely use some love */
-		len = 0;
-	}
-	if (len) {
-		BUG_ON(area->a_used_bytes >= super->s_segsize);
-
-		page = get_mapping_page(area->a_sb, index, 0);
+	if (len % PAGE_SIZE) {
+		page = get_mapping_page(sb, index, 0);
 		BUG_ON(!page); /* FIXME: reserve a pool */
 		memset(page_address(page) + offset, 0xff, len);
 		SetPagePrivate(page);
 		page_cache_release(page);
 	}
+}

-	if (!final)
-		return;
+static void pad_full_pages(struct logfs_area *area)
+{
+	struct super_block *sb = area->a_sb;
+	struct logfs_super *super = logfs_super(sb);
+	u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
+	u32 len = super->s_segsize - area->a_used_bytes;
+	pgoff_t index = PAGE_CACHE_ALIGN(ofs) >> PAGE_CACHE_SHIFT;
+	pgoff_t no_indizes = len >> PAGE_CACHE_SHIFT;
+	struct page *page;

-	area->a_used_bytes += len;
-	for ( ; area->a_used_bytes < super->s_segsize;
-			area->a_used_bytes += PAGE_SIZE) {
-		/* Memset another page */
-		index++;
-		page = get_mapping_page(area->a_sb, index, 0);
+	while (no_indizes) {
+		page = get_mapping_page(sb, index, 0);
 		BUG_ON(!page); /* FIXME: reserve a pool */
-		memset(page_address(page), 0xff, PAGE_SIZE);
+		SetPageUptodate(page);
+		memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
 		SetPagePrivate(page);
 		page_cache_release(page);
+		index++;
+		no_indizes--;
 	}
 }

 /*
+ * bdev_writeseg will write full pages. Memset the tail to prevent data leaks.
+ * Also make sure we allocate (and memset) all pages for final writeout.
+ */
+static void pad_wbuf(struct logfs_area *area, int final)
+{
+	pad_partial_page(area);
+	if (final)
+		pad_full_pages(area);
+}
+
+/*
  * We have to be careful with the alias tree. Since lookup is done by bix,
  * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with
  * indirect blocks. So always use it through accessor functions.
@@ -174,14 +186,8 @@ static int btree_write_alias(struct super_block *sb, struct logfs_block *block,
 	return 0;
 }

-static gc_level_t btree_block_level(struct logfs_block *block)
-{
-	return expand_level(block->ino, block->level);
-}
-
 static struct logfs_block_ops btree_block_ops = {
 	.write_block = btree_write_block,
-	.block_level = btree_block_level,
 	.free_block = __free_block,
 	.write_alias = btree_write_alias,
 };
@@ -683,7 +689,7 @@ int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow)
 	return 0;
 }

-static void freeseg(struct super_block *sb, u32 segno)
+void freeseg(struct super_block *sb, u32 segno)
 {
 	struct logfs_super *super = logfs_super(sb);
 	struct address_space *mapping = super->s_mapping_inode->i_mapping;
@@ -910,7 +916,7 @@ err:
 	for (i--; i >= 0; i--)
 		free_area(super->s_area[i]);
 	free_area(super->s_journal_area);
-	mempool_destroy(super->s_alias_pool);
+	logfs_mempool_destroy(super->s_alias_pool);
 	return -ENOMEM;
 }
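After this change pad_wbuf() is a thin wrapper: pad_partial_page() always pads the tail of the current page, and pad_full_pages() runs only for a segment's final writeout, allocating and filling every remaining page. Both fill with 0xff, which matches the state of freshly erased flash. A sketch of the page-count arithmetic pad_full_pages() relies on (pages_left_in_segment() is a hypothetical helper, not in the patch):

static pgoff_t pages_left_in_segment(u64 segsize, u64 used_bytes,
				     unsigned int page_shift)
{
	/* Whole pages between the (page-aligned) write position and
	 * the end of the segment; each one gets memset to 0xff. */
	return (segsize - used_bytes) >> page_shift;
}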
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index c66beab78dee..d651e10a1e9c 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -11,6 +11,8 @@
  */
 #include "logfs.h"
 #include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
 #include <linux/mtd/mtd.h>
 #include <linux/statfs.h>
 #include <linux/buffer_head.h>
@@ -136,6 +138,14 @@ static int logfs_sb_set(struct super_block *sb, void *_super)
 	sb->s_fs_info = super;
 	sb->s_mtd = super->s_mtd;
 	sb->s_bdev = super->s_bdev;
+#ifdef CONFIG_BLOCK
+	if (sb->s_bdev)
+		sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info;
+#endif
+#ifdef CONFIG_MTD
+	if (sb->s_mtd)
+		sb->s_bdi = sb->s_mtd->backing_dev_info;
+#endif
 	return 0;
 }

@@ -277,7 +287,7 @@ static int logfs_recover_sb(struct super_block *sb)
 	}
 	if (valid0 && valid1 && ds_cmp(ds0, ds1)) {
 		printk(KERN_INFO"Superblocks don't match - fixing.\n");
-		return write_one_sb(sb, super->s_devops->find_last_sb);
+		return logfs_write_sb(sb);
 	}
 	/* If neither is valid now, something's wrong. Didn't we properly
 	 * check them before?!? */
@@ -289,6 +299,10 @@ static int logfs_make_writeable(struct super_block *sb)
 {
 	int err;

+	err = logfs_open_segfile(sb);
+	if (err)
+		return err;
+
 	/* Repair any broken superblock copies */
 	err = logfs_recover_sb(sb);
 	if (err)
@@ -299,10 +313,6 @@
 	if (err)
 		return err;

-	err = logfs_open_segfile(sb);
-	if (err)
-		return err;
-
 	/* Do one GC pass before any data gets dirtied */
 	logfs_gc_pass(sb);

@@ -327,27 +337,27 @@ static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
 		goto fail;

 	sb->s_root = d_alloc_root(rootdir);
-	if (!sb->s_root)
+	if (!sb->s_root) {
+		iput(rootdir);
 		goto fail;
+	}

 	super->s_erase_page = alloc_pages(GFP_KERNEL, 0);
 	if (!super->s_erase_page)
-		goto fail2;
+		goto fail;
 	memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE);

 	/* FIXME: check for read-only mounts */
 	err = logfs_make_writeable(sb);
 	if (err)
-		goto fail3;
+		goto fail1;

 	log_super("LogFS: Finished mounting\n");
 	simple_set_mnt(mnt, sb);
 	return 0;

-fail3:
+fail1:
 	__free_page(super->s_erase_page);
-fail2:
-	iput(rootdir);
 fail:
 	iput(logfs_super(sb)->s_master_inode);
 	return -EIO;
@@ -376,7 +386,7 @@ static struct page *find_super_block(struct super_block *sb)
 	if (!first || IS_ERR(first))
 		return NULL;
 	last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]);
-	if (!last || IS_ERR(first)) {
+	if (!last || IS_ERR(last)) {
 		page_cache_release(first);
 		return NULL;
 	}
@@ -407,7 +417,7 @@ static int __logfs_read_sb(struct super_block *sb)

 	page = find_super_block(sb);
 	if (!page)
-		return -EIO;
+		return -EINVAL;

 	ds = page_address(page);
 	super->s_size = be64_to_cpu(ds->ds_filesystem_size);
@@ -451,6 +461,8 @@ static int logfs_read_sb(struct super_block *sb, int read_only)

 	btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool);
 	btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool);
+	btree_init_mempool32(&super->s_shadow_tree.segment_map,
+			super->s_btree_pool);

 	ret = logfs_init_mapping(sb);
 	if (ret)
@@ -515,8 +527,8 @@ static void logfs_kill_sb(struct super_block *sb)
 	if (super->s_erase_page)
 		__free_page(super->s_erase_page);
 	super->s_devops->put_device(sb);
-	mempool_destroy(super->s_btree_pool);
-	mempool_destroy(super->s_alias_pool);
+	logfs_mempool_destroy(super->s_btree_pool);
+	logfs_mempool_destroy(super->s_alias_pool);
 	kfree(super);
 	log_super("LogFS: Finished unmounting\n");
 }
@@ -572,8 +584,7 @@ int logfs_get_sb_device(struct file_system_type *type, int flags,
 	return 0;

 err1:
-	up_write(&sb->s_umount);
-	deactivate_super(sb);
+	deactivate_locked_super(sb);
 	return err;
 err0:
 	kfree(super);
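Two error-handling fixes above are easy to miss: find_super_block() now tests the right pointer (last, not first) before giving up, and logfs_get_sb_device() uses deactivate_locked_super(), which drops s_umount as part of tearing the superblock down, instead of the old up_write()/deactivate_super() pair. A hedged sketch of the resulting error path (mount_example() and fill_example() are hypothetical stand-ins, not logfs functions):

static int mount_example(struct file_system_type *type, void *data,
			 int (*fill_example)(struct super_block *sb))
{
	struct super_block *sb;
	int err;

	sb = sget(type, NULL, set_anon_super, data); /* returned locked */
	if (IS_ERR(sb))
		return PTR_ERR(sb);
	err = fill_example(sb);
	if (err) {
		/* Releases s_umount and the active reference in one call. */
		deactivate_locked_super(sb);
		return err;
	}
	up_write(&sb->s_umount);
	return 0;
}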
diff --git a/fs/minix/itree_v1.c b/fs/minix/itree_v1.c
index 82d6554b02fe..282e15ad8cd8 100644
--- a/fs/minix/itree_v1.c
+++ b/fs/minix/itree_v1.c
@@ -1,4 +1,5 @@
 #include <linux/buffer_head.h>
+#include <linux/slab.h>
 #include "minix.h"

 enum {DEPTH = 3, DIRECT = 7}; /* Only double indirect */
diff --git a/fs/mpage.c b/fs/mpage.c
index 42381bd6543b..fd56ca2ea556 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -16,6 +16,7 @@
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/kdev_t.h>
+#include <linux/gfp.h>
 #include <linux/bio.h>
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
@@ -561,7 +562,7 @@ page_is_mapped:
 	if (page->index >= end_index) {
 		/*
 		 * The page straddles i_size. It must be zeroed out on each
-		 * and every writepage invokation because it may be mmapped.
+		 * and every writepage invocation because it may be mmapped.
 		 * "A file is mapped in multiples of the page size. For a file
 		 * that is not a multiple of the page size, the remaining memory
 		 * is zeroed when mapped, and writes to that region are not
diff --git a/fs/namei.c b/fs/namei.c
index 48e60a187325..b86b96fe1dc3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1610,8 +1610,7 @@ exit:

 static struct file *do_last(struct nameidata *nd, struct path *path,
 			int open_flag, int acc_mode,
-			int mode, const char *pathname,
-			int *want_dir)
+			int mode, const char *pathname)
 {
 	struct dentry *dir = nd->path.dentry;
 	struct file *filp;
@@ -1642,7 +1641,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	if (nd->last.name[nd->last.len]) {
 		if (open_flag & O_CREAT)
 			goto exit;
-		*want_dir = 1;
+		nd->flags |= LOOKUP_DIRECTORY | LOOKUP_FOLLOW;
 	}

 	/* just plain open? */
@@ -1656,8 +1655,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		if (path->dentry->d_inode->i_op->follow_link)
 			return NULL;
 		error = -ENOTDIR;
-		if (*want_dir && !path->dentry->d_inode->i_op->lookup)
-			goto exit_dput;
+		if (nd->flags & LOOKUP_DIRECTORY) {
+			if (!path->dentry->d_inode->i_op->lookup)
+				goto exit_dput;
+		}
 		path_to_nameidata(path, nd);
 		audit_inode(pathname, nd->path.dentry);
 		goto ok;
@@ -1766,7 +1767,6 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	int count = 0;
 	int flag = open_to_namei_flags(open_flag);
 	int force_reval = 0;
-	int want_dir = open_flag & O_DIRECTORY;

 	if (!(open_flag & O_CREAT))
 		mode = 0;
@@ -1828,14 +1828,18 @@ reval:
 		if (open_flag & O_EXCL)
 			nd.flags |= LOOKUP_EXCL;
 	}
-	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &want_dir);
+	if (open_flag & O_DIRECTORY)
+		nd.flags |= LOOKUP_DIRECTORY;
+	if (!(open_flag & O_NOFOLLOW))
+		nd.flags |= LOOKUP_FOLLOW;
+	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
 	while (unlikely(!filp)) { /* trailing symlink */
 		struct path holder;
 		struct inode *inode = path.dentry->d_inode;
 		void *cookie;
 		error = -ELOOP;
 		/* S_ISDIR part is a temporary automount kludge */
-		if ((open_flag & O_NOFOLLOW) && !S_ISDIR(inode->i_mode))
+		if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(inode->i_mode))
 			goto exit_dput;
 		if (count++ == 32)
 			goto exit_dput;
@@ -1866,7 +1870,7 @@ reval:
 	}
 	holder = path;
 	nd.flags &= ~LOOKUP_PARENT;
-	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &want_dir);
+	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
 	if (inode->i_op->put_link)
 		inode->i_op->put_link(holder.dentry, &nd, cookie);
 	path_put(&holder);
@@ -2172,8 +2176,10 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 		error = security_inode_rmdir(dir, dentry);
 		if (!error) {
 			error = dir->i_op->rmdir(dir, dentry);
-			if (!error)
+			if (!error) {
 				dentry->d_inode->i_flags |= S_DEAD;
+				dont_mount(dentry);
+			}
 		}
 	}
 	mutex_unlock(&dentry->d_inode->i_mutex);
@@ -2257,7 +2263,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
 		if (!error) {
 			error = dir->i_op->unlink(dir, dentry);
 			if (!error)
-				dentry->d_inode->i_flags |= S_DEAD;
+				dont_mount(dentry);
 		}
 	}
 	mutex_unlock(&dentry->d_inode->i_mutex);
@@ -2544,7 +2550,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
  * e) conversion from fhandle to dentry may come in the wrong moment - when
  *    we are removing the target. Solution: we will have to grab ->i_mutex
  *    in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
- *    ->i_mutex on parents, which works but leads to some truely excessive
+ *    ->i_mutex on parents, which works but leads to some truly excessive
  *    locking].
  */
 static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
@@ -2568,17 +2574,20 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
 		return error;

 	target = new_dentry->d_inode;
-	if (target) {
+	if (target)
 		mutex_lock(&target->i_mutex);
-		dentry_unhash(new_dentry);
-	}
 	if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
 		error = -EBUSY;
-	else
+	else {
+		if (target)
+			dentry_unhash(new_dentry);
 		error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
+	}
 	if (target) {
-		if (!error)
+		if (!error) {
 			target->i_flags |= S_DEAD;
+			dont_mount(new_dentry);
+		}
 		mutex_unlock(&target->i_mutex);
 		if (d_unhashed(new_dentry))
 			d_rehash(new_dentry);
@@ -2610,7 +2619,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
 	error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
 	if (!error) {
 		if (target)
-			target->i_flags |= S_DEAD;
+			dont_mount(new_dentry);
 		if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
 			d_move(old_dentry, new_dentry);
 	}
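The namei changes above replace the ad-hoc want_dir argument with the existing nd.flags bits: O_DIRECTORY and a trailing slash both set LOOKUP_DIRECTORY, and LOOKUP_FOLLOW now carries the O_NOFOLLOW decision down to the trailing-symlink check. The mapping in isolation (map_open_to_lookup() is a hypothetical helper; the real code sets nd.flags inline, with the constants from linux/fcntl.h and linux/namei.h):

static unsigned int map_open_to_lookup(int open_flag, unsigned int flags)
{
	if (open_flag & O_DIRECTORY)
		flags |= LOOKUP_DIRECTORY; /* must resolve to a directory */
	if (!(open_flag & O_NOFOLLOW))
		flags |= LOOKUP_FOLLOW; /* may follow a trailing symlink */
	return flags;
}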
diff --git a/fs/namespace.c b/fs/namespace.c
index 8174c8ab5c70..88058de59c7c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -628,7 +628,6 @@ repeat:
 		mnt->mnt_pinned = 0;
 		spin_unlock(&vfsmount_lock);
 		acct_auto_close_mnt(mnt);
-		security_sb_umount_close(mnt);
 		goto repeat;
 	}
 }
@@ -1117,8 +1116,6 @@ static int do_umount(struct vfsmount *mnt, int flags)
 		retval = 0;
 	}
 	spin_unlock(&vfsmount_lock);
-	if (retval)
-		security_sb_umount_busy(mnt);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
 	return retval;
@@ -1432,20 +1429,13 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)

 	err = -ENOENT;
 	mutex_lock(&path->dentry->d_inode->i_mutex);
-	if (IS_DEADDIR(path->dentry->d_inode))
-		goto out_unlock;
-
-	err = security_sb_check_sb(mnt, path);
-	if (err)
+	if (cant_mount(path->dentry))
 		goto out_unlock;

-	err = -ENOENT;
 	if (!d_unlinked(path->dentry))
 		err = attach_recursive_mnt(mnt, path, NULL);
 out_unlock:
 	mutex_unlock(&path->dentry->d_inode->i_mutex);
-	if (!err)
-		security_sb_post_addmount(mnt, path);
 	return err;
 }

@@ -1581,8 +1571,6 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
 	}
 	up_write(&sb->s_umount);
 	if (!err) {
-		security_sb_post_remount(path->mnt, flags, data);
-
 		spin_lock(&vfsmount_lock);
 		touch_mnt_namespace(path->mnt->mnt_ns);
 		spin_unlock(&vfsmount_lock);
@@ -1623,7 +1611,7 @@ static int do_move_mount(struct path *path, char *old_name)

 	err = -ENOENT;
 	mutex_lock(&path->dentry->d_inode->i_mutex);
-	if (IS_DEADDIR(path->dentry->d_inode))
+	if (cant_mount(path->dentry))
 		goto out1;

 	if (d_unlinked(path->dentry))
@@ -2234,7 +2222,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	if (!check_mnt(root.mnt))
 		goto out2;
 	error = -ENOENT;
-	if (IS_DEADDIR(new.dentry->d_inode))
+	if (cant_mount(old.dentry))
 		goto out2;
 	if (d_unlinked(new.dentry))
 		goto out2;
@@ -2277,7 +2265,6 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
 	spin_unlock(&vfsmount_lock);
 	chroot_fs_refs(&root, &new);
-	security_sb_post_pivotroot(&root, &new);
 	error = 0;
 	path_put(&root_parent);
 	path_put(&parent_path);
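The cant_mount()/dont_mount() helpers used above are introduced elsewhere in this series (in include/linux/dcache.h); they replace the IS_DEADDIR() checks with a dedicated dentry flag so that rmdir, unlink, and rename can veto future mounts on a dying dentry without racing against mount(2). Their approximate shape, paraphrased rather than quoted from that header:

static inline int cant_mount(struct dentry *dentry)
{
	return (dentry->d_flags & DCACHE_CANT_MOUNT);
}

static inline void dont_mount(struct dentry *dentry)
{
	/* d_flags is protected by d_lock, hence the lock here. */
	spin_lock(&dentry->d_lock);
	dentry->d_flags |= DCACHE_CANT_MOUNT;
	spin_unlock(&dentry->d_lock);
}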
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index b8b5b30d53f0..7edfcd4d5e52 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -15,7 +15,6 @@
 #include <linux/errno.h>
 #include <linux/stat.h>
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/mm.h>
 #include <asm/uaccess.h>
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 6a7d901f1936..1daabb90e0a5 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -15,7 +15,6 @@
 #include <linux/fcntl.h>
 #include <linux/stat.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>
 #include <linux/smp_lock.h>
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index cf98da1be23e..fa3385154023 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -526,10 +526,15 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	sb->s_blocksize_bits = 10;
 	sb->s_magic = NCP_SUPER_MAGIC;
 	sb->s_op = &ncp_sops;
+	sb->s_bdi = &server->bdi;

 	server = NCP_SBP(sb);
 	memset(server, 0, sizeof(*server));

+	error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY);
+	if (error)
+		goto out_bdi;
+
 	server->ncp_filp = ncp_filp;
 	server->ncp_sock = sock;

@@ -719,6 +724,8 @@ out_fput2:
 	if (server->info_filp)
 		fput(server->info_filp);
 out_fput:
+	bdi_destroy(&server->bdi);
+out_bdi:
 	/* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>:
 	 *
 	 * The previously used put_filp(ncp_filp); was bogous, since
@@ -756,6 +763,7 @@ static void ncp_put_super(struct super_block *sb)
 	kill_pid(server->m.wdog_pid, SIGTERM, 1);
 	put_pid(server->m.wdog_pid);

+	bdi_destroy(&server->bdi);
 	kfree(server->priv.data);
 	kfree(server->auth.object_name);
 	vfree(server->rxbuf);
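The bdi hunks above give each ncpfs mount its own backing_dev_info, which per-superblock writeback needs now that network filesystems no longer share a default bdi. The lifecycle is: register during fill_super before any page can be dirtied, destroy on the fill_super error path and again in put_super. A condensed sketch (my_fill_super() and my_put_super() are hypothetical stand-ins for the ncpfs functions):

static int my_fill_super(struct super_block *sb, struct ncp_server *server)
{
	int error;

	error = bdi_setup_and_register(&server->bdi, "ncpfs",
				       BDI_CAP_MAP_COPY);
	if (error)
		return error; /* nothing to unwind yet */
	sb->s_bdi = &server->bdi; /* writeback target for this sb */
	return 0;
}

static void my_put_super(struct ncp_server *server)
{
	bdi_destroy(&server->bdi); /* pairs with bdi_setup_and_register() */
}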
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index ec8f45f12e05..60a5e2864ea8 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -15,6 +15,7 @@
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/mount.h>
+#include <linux/slab.h>
 #include <linux/highuid.h>
 #include <linux/smp_lock.h>
 #include <linux/vmalloc.h>
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 15458decdb8a..56f5b3a0e1ee 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -9,12 +9,12 @@
 #include <linux/stat.h>
 #include <linux/time.h>
 #include <linux/kernel.h>
+#include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/shm.h>
 #include <linux/errno.h>
 #include <linux/mman.h>
 #include <linux/string.h>
-#include <linux/slab.h>
 #include <linux/fcntl.h>
 #include <linux/ncp_fs.h>

diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index e37df8d5fe70..c7ff6c700a6e 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -21,6 +21,7 @@
 #include <linux/mm.h>
 #include <linux/netdevice.h>
 #include <linux/signal.h>
+#include <linux/slab.h>
 #include <net/scm.h>
 #include <net/sock.h>
 #include <linux/ipx.h>
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index e3d26c1bd105..c634fd17b337 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -27,6 +27,7 @@
 #include <linux/fs.h>
 #include <linux/ncp_fs.h>
 #include <linux/time.h>
+#include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/stat.h>
 #include "ncplib_kernel.h"
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
index b4ffd0146ea6..84690319e625 100644
--- a/fs/nfs/cache_lib.c
+++ b/fs/nfs/cache_lib.c
@@ -10,6 +10,7 @@
 #include <linux/moduleparam.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 #include <linux/sunrpc/cache.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 84761b5bb8e2..a08770a7e857 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -7,6 +7,7 @@
  */
 #include <linux/nfs4.h>
 #include <linux/nfs_fs.h>
+#include <linux/slab.h>
 #include "nfs4_fs.h"
 #include "callback.h"
 #include "delegation.h"
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index db30c0b398b5..05af212f0edf 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -9,6 +9,7 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/nfs4.h>
 #include <linux/nfs_fs.h>
+#include <linux/slab.h>
 #include "nfs4_fs.h"
 #include "callback.h"

@@ -782,6 +783,7 @@ struct svc_version nfs4_callback_version1 = {
 	.vs_proc = nfs4_callback_procedures1,
 	.vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
 	.vs_dispatch = NULL,
+	.vs_hidden = 1,
 };

 struct svc_version nfs4_callback_version4 = {
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 2274f1737336..7ec9b34a59f8 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -35,6 +35,7 @@
 #include <linux/vfs.h>
 #include <linux/inet.h>
 #include <linux/in6.h>
+#include <linux/slab.h>
 #include <net/ipv6.h>
 #include <linux/nfs_xdr.h>
 #include <linux/sunrpc/bc_xprt.h>
@@ -933,7 +934,6 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
 	}

 	fsinfo.fattr = fattr;
-	nfs_fattr_init(fattr);
 	error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
 	if (error < 0)
 		goto out_error;
@@ -965,6 +965,8 @@ out_error:
 static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source)
 {
 	target->flags = source->flags;
+	target->rsize = source->rsize;
+	target->wsize = source->wsize;
 	target->acregmin = source->acregmin;
 	target->acregmax = source->acregmax;
 	target->acdirmin = source->acdirmin;
@@ -1044,13 +1046,18 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
 		struct nfs_fh *mntfh)
 {
 	struct nfs_server *server;
-	struct nfs_fattr fattr;
+	struct nfs_fattr *fattr;
 	int error;

 	server = nfs_alloc_server();
 	if (!server)
 		return ERR_PTR(-ENOMEM);

+	error = -ENOMEM;
+	fattr = nfs_alloc_fattr();
+	if (fattr == NULL)
+		goto error;
+
 	/* Get a client representation */
 	error = nfs_init_server(server, data);
 	if (error < 0)
@@ -1061,7 +1068,7 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
 	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);

 	/* Probe the root fh to retrieve its FSID */
-	error = nfs_probe_fsinfo(server, mntfh, &fattr);
+	error = nfs_probe_fsinfo(server, mntfh, fattr);
 	if (error < 0)
 		goto error;
 	if (server->nfs_client->rpc_ops->version == 3) {
@@ -1074,14 +1081,14 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
 		server->namelen = NFS2_MAXNAMLEN;
 	}

-	if (!(fattr.valid & NFS_ATTR_FATTR)) {
-		error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr);
+	if (!(fattr->valid & NFS_ATTR_FATTR)) {
+		error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr);
 		if (error < 0) {
 			dprintk("nfs_create_server: getattr error = %d\n", -error);
 			goto error;
 		}
 	}
-	memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid));
+	memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));

 	dprintk("Server FSID: %llx:%llx\n",
 		(unsigned long long) server->fsid.major,
@@ -1093,9 +1100,11 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
 	spin_unlock(&nfs_client_lock);

 	server->mount_time = jiffies;
+	nfs_free_fattr(fattr);
 	return server;

 error:
+	nfs_free_fattr(fattr);
 	nfs_free_server(server);
 	return ERR_PTR(error);
 }
@@ -1293,7 +1302,8 @@ static int nfs4_init_server(struct nfs_server *server,

 	/* Initialise the client representation from the mount data */
 	server->flags = data->flags;
-	server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR;
+	server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|
+		NFS_CAP_POSIX_LOCK;
 	server->options = data->options;

 	/* Get a client record */
@@ -1336,7 +1346,7 @@ error:
 struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
 		struct nfs_fh *mntfh)
 {
-	struct nfs_fattr fattr;
+	struct nfs_fattr *fattr;
 	struct nfs_server *server;
 	int error;

@@ -1346,6 +1356,11 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
 	if (!server)
 		return ERR_PTR(-ENOMEM);

+	error = -ENOMEM;
+	fattr = nfs_alloc_fattr();
+	if (fattr == NULL)
+		goto error;
+
 	/* set up the general RPC client */
 	error = nfs4_init_server(server, data);
 	if (error < 0)
@@ -1360,7 +1375,7 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
 		goto error;

 	/* Probe the root fh to retrieve its FSID */
-	error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path);
+	error = nfs4_get_rootfh(server, mntfh);
 	if (error < 0)
 		goto error;

@@ -1371,7 +1386,7 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,

 	nfs4_session_set_rwsize(server);

-	error = nfs_probe_fsinfo(server, mntfh, &fattr);
+	error = nfs_probe_fsinfo(server, mntfh, fattr);
 	if (error < 0)
 		goto error;

@@ -1385,9 +1400,11 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,

 	server->mount_time = jiffies;
 	dprintk("<-- nfs4_create_server() = %p\n", server);
+	nfs_free_fattr(fattr);
 	return server;

 error:
+	nfs_free_fattr(fattr);
 	nfs_free_server(server);
 	dprintk("<-- nfs4_create_server() = error %d\n", error);
 	return ERR_PTR(error);
@@ -1401,7 +1418,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 {
 	struct nfs_client *parent_client;
 	struct nfs_server *server, *parent_server;
-	struct nfs_fattr fattr;
+	struct nfs_fattr *fattr;
 	int error;

 	dprintk("--> nfs4_create_referral_server()\n");
@@ -1410,6 +1427,11 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 	if (!server)
 		return ERR_PTR(-ENOMEM);

+	error = -ENOMEM;
+	fattr = nfs_alloc_fattr();
+	if (fattr == NULL)
+		goto error;
+
 	parent_server = NFS_SB(data->sb);
 	parent_client = parent_server->nfs_client;

@@ -1439,12 +1461,12 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);

 	/* Probe the root fh to retrieve its FSID and filehandle */
-	error = nfs4_path_walk(server, mntfh, data->mnt_path);
+	error = nfs4_get_rootfh(server, mntfh);
 	if (error < 0)
 		goto error;

 	/* probe the filesystem info for this server filesystem */
-	error = nfs_probe_fsinfo(server, mntfh, &fattr);
+	error = nfs_probe_fsinfo(server, mntfh, fattr);
 	if (error < 0)
 		goto error;

@@ -1462,10 +1484,12 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,

 	server->mount_time = jiffies;

+	nfs_free_fattr(fattr);
 	dprintk("<-- nfs_create_referral_server() = %p\n", server);
 	return server;

 error:
+	nfs_free_fattr(fattr);
 	nfs_free_server(server);
 	dprintk("<-- nfs4_create_referral_server() = error %d\n", error);
 	return ERR_PTR(error);
@@ -1481,7 +1505,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
 		struct nfs_fattr *fattr)
 {
 	struct nfs_server *server;
-	struct nfs_fattr fattr_fsinfo;
+	struct nfs_fattr *fattr_fsinfo;
 	int error;

 	dprintk("--> nfs_clone_server(,%llx:%llx,)\n",
@@ -1492,6 +1516,11 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
 	if (!server)
 		return ERR_PTR(-ENOMEM);

+	error = -ENOMEM;
+	fattr_fsinfo = nfs_alloc_fattr();
+	if (fattr_fsinfo == NULL)
+		goto out_free_server;
+
 	/* Copy data from the source */
 	server->nfs_client = source->nfs_client;
 	atomic_inc(&server->nfs_client->cl_count);
@@ -1508,7 +1537,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
 	nfs_init_server_aclclient(server);

 	/* probe the filesystem info for this server filesystem */
-	error = nfs_probe_fsinfo(server, fh, &fattr_fsinfo);
+	error = nfs_probe_fsinfo(server, fh, fattr_fsinfo);
 	if (error < 0)
 		goto out_free_server;

@@ -1530,10 +1559,12 @@

 	server->mount_time = jiffies;
1532 1561
1562 nfs_free_fattr(fattr_fsinfo);
1533 dprintk("<-- nfs_clone_server() = %p\n", server); 1563 dprintk("<-- nfs_clone_server() = %p\n", server);
1534 return server; 1564 return server;
1535 1565
1536out_free_server: 1566out_free_server:
1567 nfs_free_fattr(fattr_fsinfo);
1537 nfs_free_server(server); 1568 nfs_free_server(server);
1538 dprintk("<-- nfs_clone_server() = error %d\n", error); 1569 dprintk("<-- nfs_clone_server() = error %d\n", error);
1539 return ERR_PTR(error); 1570 return ERR_PTR(error);
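
The fs/nfs/client.c hunks above all make the same transformation: struct nfs_fattr is too large to keep on the kernel stack, so each function switches to nfs_alloc_fattr() and releases the result on both the success and error paths. Below is a minimal standalone C sketch of that alloc / goto-error / free shape; the types and helper names are illustrative stand-ins, not the kernel API.

    /* Standalone sketch of the alloc-then-goto-error cleanup shape used above.
     * The types and helpers here are stand-ins, not the NFS code. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <errno.h>

    struct fattr { int valid; };            /* stand-in for struct nfs_fattr */

    static struct fattr *fattr_alloc(void)
    {
        return calloc(1, sizeof(struct fattr));
    }

    static int probe_fsinfo(struct fattr *f) /* stand-in for nfs_probe_fsinfo() */
    {
        f->valid = 1;
        return 0;
    }

    static int create_server(void)
    {
        struct fattr *fattr;
        int error = -ENOMEM;

        fattr = fattr_alloc();
        if (fattr == NULL)
            goto error;                     /* nothing else to unwind yet */

        error = probe_fsinfo(fattr);
        if (error < 0)
            goto error;

        free(fattr);                        /* success path frees too */
        return 0;
    error:
        free(fattr);                        /* free(NULL) is a no-op */
        return error;
    }

    int main(void)
    {
        printf("create_server() = %d\n", create_server());
        return 0;
    }

Because free(NULL), like kfree(NULL), is a no-op, a single error label can cover the allocation failure itself as well as every later failure.
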
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 2563bebc4c67..301634543974 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -10,6 +10,7 @@
10#include <linux/kthread.h> 10#include <linux/kthread.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/slab.h>
13#include <linux/smp_lock.h> 14#include <linux/smp_lock.h>
14#include <linux/spinlock.h> 15#include <linux/spinlock.h>
15 16
@@ -23,6 +24,8 @@
23 24
24static void nfs_do_free_delegation(struct nfs_delegation *delegation) 25static void nfs_do_free_delegation(struct nfs_delegation *delegation)
25{ 26{
27 if (delegation->cred)
28 put_rpccred(delegation->cred);
26 kfree(delegation); 29 kfree(delegation);
27} 30}
28 31
@@ -35,13 +38,7 @@ static void nfs_free_delegation_callback(struct rcu_head *head)
35 38
36static void nfs_free_delegation(struct nfs_delegation *delegation) 39static void nfs_free_delegation(struct nfs_delegation *delegation)
37{ 40{
38 struct rpc_cred *cred;
39
40 cred = rcu_dereference(delegation->cred);
41 rcu_assign_pointer(delegation->cred, NULL);
42 call_rcu(&delegation->rcu, nfs_free_delegation_callback); 41 call_rcu(&delegation->rcu, nfs_free_delegation_callback);
43 if (cred)
44 put_rpccred(cred);
45} 42}
46 43
47void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) 44void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
@@ -128,21 +125,35 @@ again:
128 */ 125 */
129void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) 126void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)
130{ 127{
131 struct nfs_delegation *delegation = NFS_I(inode)->delegation; 128 struct nfs_delegation *delegation;
132 struct rpc_cred *oldcred; 129 struct rpc_cred *oldcred = NULL;
133 130
134 if (delegation == NULL) 131 rcu_read_lock();
135 return; 132 delegation = rcu_dereference(NFS_I(inode)->delegation);
136 memcpy(delegation->stateid.data, res->delegation.data, 133 if (delegation != NULL) {
137 sizeof(delegation->stateid.data)); 134 spin_lock(&delegation->lock);
138 delegation->type = res->delegation_type; 135 if (delegation->inode != NULL) {
139 delegation->maxsize = res->maxsize; 136 memcpy(delegation->stateid.data, res->delegation.data,
140 oldcred = delegation->cred; 137 sizeof(delegation->stateid.data));
141 delegation->cred = get_rpccred(cred); 138 delegation->type = res->delegation_type;
142 clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); 139 delegation->maxsize = res->maxsize;
143 NFS_I(inode)->delegation_state = delegation->type; 140 oldcred = delegation->cred;
144 smp_wmb(); 141 delegation->cred = get_rpccred(cred);
145 put_rpccred(oldcred); 142 clear_bit(NFS_DELEGATION_NEED_RECLAIM,
143 &delegation->flags);
144 NFS_I(inode)->delegation_state = delegation->type;
145 spin_unlock(&delegation->lock);
146 put_rpccred(oldcred);
147 rcu_read_unlock();
148 } else {
149 /* We appear to have raced with a delegation return. */
150 spin_unlock(&delegation->lock);
151 rcu_read_unlock();
152 nfs_inode_set_delegation(inode, cred, res);
153 }
154 } else {
155 rcu_read_unlock();
156 }
146} 157}
147 158
148static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) 159static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
@@ -165,9 +176,13 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation
165 return inode; 176 return inode;
166} 177}
167 178
168static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, const nfs4_stateid *stateid) 179static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi,
180 const nfs4_stateid *stateid,
181 struct nfs_client *clp)
169{ 182{
170 struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation); 183 struct nfs_delegation *delegation =
184 rcu_dereference_protected(nfsi->delegation,
185 lockdep_is_held(&clp->cl_lock));
171 186
172 if (delegation == NULL) 187 if (delegation == NULL)
173 goto nomatch; 188 goto nomatch;
@@ -194,11 +209,11 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
194{ 209{
195 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 210 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
196 struct nfs_inode *nfsi = NFS_I(inode); 211 struct nfs_inode *nfsi = NFS_I(inode);
197 struct nfs_delegation *delegation; 212 struct nfs_delegation *delegation, *old_delegation;
198 struct nfs_delegation *freeme = NULL; 213 struct nfs_delegation *freeme = NULL;
199 int status = 0; 214 int status = 0;
200 215
201 delegation = kmalloc(sizeof(*delegation), GFP_KERNEL); 216 delegation = kmalloc(sizeof(*delegation), GFP_NOFS);
202 if (delegation == NULL) 217 if (delegation == NULL)
203 return -ENOMEM; 218 return -ENOMEM;
204 memcpy(delegation->stateid.data, res->delegation.data, 219 memcpy(delegation->stateid.data, res->delegation.data,
@@ -212,10 +227,12 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
212 spin_lock_init(&delegation->lock); 227 spin_lock_init(&delegation->lock);
213 228
214 spin_lock(&clp->cl_lock); 229 spin_lock(&clp->cl_lock);
215 if (rcu_dereference(nfsi->delegation) != NULL) { 230 old_delegation = rcu_dereference_protected(nfsi->delegation,
216 if (memcmp(&delegation->stateid, &nfsi->delegation->stateid, 231 lockdep_is_held(&clp->cl_lock));
217 sizeof(delegation->stateid)) == 0 && 232 if (old_delegation != NULL) {
218 delegation->type == nfsi->delegation->type) { 233 if (memcmp(&delegation->stateid, &old_delegation->stateid,
234 sizeof(old_delegation->stateid)) == 0 &&
235 delegation->type == old_delegation->type) {
219 goto out; 236 goto out;
220 } 237 }
221 /* 238 /*
@@ -225,12 +242,12 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
225 dfprintk(FILE, "%s: server %s handed out " 242 dfprintk(FILE, "%s: server %s handed out "
226 "a duplicate delegation!\n", 243 "a duplicate delegation!\n",
227 __func__, clp->cl_hostname); 244 __func__, clp->cl_hostname);
228 if (delegation->type <= nfsi->delegation->type) { 245 if (delegation->type <= old_delegation->type) {
229 freeme = delegation; 246 freeme = delegation;
230 delegation = NULL; 247 delegation = NULL;
231 goto out; 248 goto out;
232 } 249 }
233 freeme = nfs_detach_delegation_locked(nfsi, NULL); 250 freeme = nfs_detach_delegation_locked(nfsi, NULL, clp);
234 } 251 }
235 list_add_rcu(&delegation->super_list, &clp->cl_delegations); 252 list_add_rcu(&delegation->super_list, &clp->cl_delegations);
236 nfsi->delegation_state = delegation->type; 253 nfsi->delegation_state = delegation->type;
@@ -300,7 +317,7 @@ restart:
300 if (inode == NULL) 317 if (inode == NULL)
301 continue; 318 continue;
302 spin_lock(&clp->cl_lock); 319 spin_lock(&clp->cl_lock);
303 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL); 320 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp);
304 spin_unlock(&clp->cl_lock); 321 spin_unlock(&clp->cl_lock);
305 rcu_read_unlock(); 322 rcu_read_unlock();
306 if (delegation != NULL) { 323 if (delegation != NULL) {
@@ -329,9 +346,9 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode)
329 struct nfs_inode *nfsi = NFS_I(inode); 346 struct nfs_inode *nfsi = NFS_I(inode);
330 struct nfs_delegation *delegation; 347 struct nfs_delegation *delegation;
331 348
332 if (rcu_dereference(nfsi->delegation) != NULL) { 349 if (rcu_access_pointer(nfsi->delegation) != NULL) {
333 spin_lock(&clp->cl_lock); 350 spin_lock(&clp->cl_lock);
334 delegation = nfs_detach_delegation_locked(nfsi, NULL); 351 delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
335 spin_unlock(&clp->cl_lock); 352 spin_unlock(&clp->cl_lock);
336 if (delegation != NULL) 353 if (delegation != NULL)
337 nfs_do_return_delegation(inode, delegation, 0); 354 nfs_do_return_delegation(inode, delegation, 0);
@@ -345,9 +362,9 @@ int nfs_inode_return_delegation(struct inode *inode)
345 struct nfs_delegation *delegation; 362 struct nfs_delegation *delegation;
346 int err = 0; 363 int err = 0;
347 364
348 if (rcu_dereference(nfsi->delegation) != NULL) { 365 if (rcu_access_pointer(nfsi->delegation) != NULL) {
349 spin_lock(&clp->cl_lock); 366 spin_lock(&clp->cl_lock);
350 delegation = nfs_detach_delegation_locked(nfsi, NULL); 367 delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
351 spin_unlock(&clp->cl_lock); 368 spin_unlock(&clp->cl_lock);
352 if (delegation != NULL) { 369 if (delegation != NULL) {
353 nfs_msync_inode(inode); 370 nfs_msync_inode(inode);
@@ -525,7 +542,7 @@ restart:
525 if (inode == NULL) 542 if (inode == NULL)
526 continue; 543 continue;
527 spin_lock(&clp->cl_lock); 544 spin_lock(&clp->cl_lock);
528 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL); 545 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp);
529 spin_unlock(&clp->cl_lock); 546 spin_unlock(&clp->cl_lock);
530 rcu_read_unlock(); 547 rcu_read_unlock();
531 if (delegation != NULL) 548 if (delegation != NULL)
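
The fs/nfs/delegation.c changes above move credential teardown into nfs_do_free_delegation() and, more importantly, make every access to nfsi->delegation use the RCU accessor that matches the calling context: rcu_dereference() under rcu_read_lock(), rcu_dereference_protected() with lockdep_is_held(&clp->cl_lock) when the update-side spinlock is held, and rcu_access_pointer() for a pure NULL test. A kernel-style sketch of the reader/updater split, assuming kernel headers that provide rcu_dereference_protected() as the hunks above do; the struct and function names are invented for illustration:

    /* Kernel-style sketch, not the NFS code. */
    #include <linux/rcupdate.h>
    #include <linux/spinlock.h>

    struct item { int value; };

    static DEFINE_SPINLOCK(item_lock);      /* update-side lock */
    static struct item __rcu *current_item;

    /* Reader: needs only the RCU read-side critical section. */
    static int item_read_value(void)
    {
        struct item *it;
        int val = -1;

        rcu_read_lock();
        it = rcu_dereference(current_item);
        if (it != NULL)
            val = it->value;
        rcu_read_unlock();
        return val;
    }

    /* Updater: holds item_lock, so the pointer cannot change underneath us.
     * rcu_dereference_protected() documents that and lets lockdep check it. */
    static struct item *item_detach_locked(void)
    {
        struct item *it = rcu_dereference_protected(current_item,
                                                    lockdep_is_held(&item_lock));

        rcu_assign_pointer(current_item, NULL);
        return it;      /* caller frees after a grace period */
    }

    static struct item *item_detach(void)
    {
        struct item *it;

        spin_lock(&item_lock);
        it = item_detach_locked();
        spin_unlock(&item_lock);
        return it;
    }

The lockdep expression is not mere documentation: with CONFIG_PROVE_RCU the kernel verifies at runtime that the claimed lock really is held, which is exactly why nfs_detach_delegation_locked() now takes the nfs_client so it can name cl_lock.
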
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 944b627ec6e1..69e7b8140122 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -71,4 +71,10 @@ static inline int nfs_inode_return_delegation(struct inode *inode)
71} 71}
72#endif 72#endif
73 73
74static inline int nfs_have_delegated_attributes(struct inode *inode)
75{
76 return nfs_have_delegation(inode, FMODE_READ) &&
77 !(NFS_I(inode)->cache_validity & NFS_INO_REVAL_FORCED);
78}
79
74#endif 80#endif
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index a1f6b4438fb1..ee9a179ebdf3 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -530,9 +530,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
530 nfs_readdir_descriptor_t my_desc, 530 nfs_readdir_descriptor_t my_desc,
531 *desc = &my_desc; 531 *desc = &my_desc;
532 struct nfs_entry my_entry; 532 struct nfs_entry my_entry;
533 struct nfs_fh fh; 533 int res = -ENOMEM;
534 struct nfs_fattr fattr;
535 long res;
536 534
537 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", 535 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
538 dentry->d_parent->d_name.name, dentry->d_name.name, 536 dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -554,9 +552,11 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
554 552
555 my_entry.cookie = my_entry.prev_cookie = 0; 553 my_entry.cookie = my_entry.prev_cookie = 0;
556 my_entry.eof = 0; 554 my_entry.eof = 0;
557 my_entry.fh = &fh; 555 my_entry.fh = nfs_alloc_fhandle();
558 my_entry.fattr = &fattr; 556 my_entry.fattr = nfs_alloc_fattr();
559 nfs_fattr_init(&fattr); 557 if (my_entry.fh == NULL || my_entry.fattr == NULL)
558 goto out_alloc_failed;
559
560 desc->entry = &my_entry; 560 desc->entry = &my_entry;
561 561
562 nfs_block_sillyrename(dentry); 562 nfs_block_sillyrename(dentry);
@@ -598,7 +598,10 @@ out:
598 nfs_unblock_sillyrename(dentry); 598 nfs_unblock_sillyrename(dentry);
599 if (res > 0) 599 if (res > 0)
600 res = 0; 600 res = 0;
601 dfprintk(FILE, "NFS: readdir(%s/%s) returns %ld\n", 601out_alloc_failed:
602 nfs_free_fattr(my_entry.fattr);
603 nfs_free_fhandle(my_entry.fh);
604 dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n",
602 dentry->d_parent->d_name.name, dentry->d_name.name, 605 dentry->d_parent->d_name.name, dentry->d_name.name,
603 res); 606 res);
604 return res; 607 return res;
@@ -776,9 +779,9 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
776 struct inode *dir; 779 struct inode *dir;
777 struct inode *inode; 780 struct inode *inode;
778 struct dentry *parent; 781 struct dentry *parent;
782 struct nfs_fh *fhandle = NULL;
783 struct nfs_fattr *fattr = NULL;
779 int error; 784 int error;
780 struct nfs_fh fhandle;
781 struct nfs_fattr fattr;
782 785
783 parent = dget_parent(dentry); 786 parent = dget_parent(dentry);
784 dir = parent->d_inode; 787 dir = parent->d_inode;
@@ -811,14 +814,22 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
811 if (NFS_STALE(inode)) 814 if (NFS_STALE(inode))
812 goto out_bad; 815 goto out_bad;
813 816
814 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); 817 error = -ENOMEM;
818 fhandle = nfs_alloc_fhandle();
819 fattr = nfs_alloc_fattr();
820 if (fhandle == NULL || fattr == NULL)
821 goto out_error;
822
823 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
815 if (error) 824 if (error)
816 goto out_bad; 825 goto out_bad;
817 if (nfs_compare_fh(NFS_FH(inode), &fhandle)) 826 if (nfs_compare_fh(NFS_FH(inode), fhandle))
818 goto out_bad; 827 goto out_bad;
819 if ((error = nfs_refresh_inode(inode, &fattr)) != 0) 828 if ((error = nfs_refresh_inode(inode, fattr)) != 0)
820 goto out_bad; 829 goto out_bad;
821 830
831 nfs_free_fattr(fattr);
832 nfs_free_fhandle(fhandle);
822out_set_verifier: 833out_set_verifier:
823 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 834 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
824 out_valid: 835 out_valid:
@@ -837,14 +848,26 @@ out_zap_parent:
837 /* If we have submounts, don't unhash ! */ 848 /* If we have submounts, don't unhash ! */
838 if (have_submounts(dentry)) 849 if (have_submounts(dentry))
839 goto out_valid; 850 goto out_valid;
851 if (dentry->d_flags & DCACHE_DISCONNECTED)
852 goto out_valid;
840 shrink_dcache_parent(dentry); 853 shrink_dcache_parent(dentry);
841 } 854 }
842 d_drop(dentry); 855 d_drop(dentry);
856 nfs_free_fattr(fattr);
857 nfs_free_fhandle(fhandle);
843 dput(parent); 858 dput(parent);
844 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", 859 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n",
845 __func__, dentry->d_parent->d_name.name, 860 __func__, dentry->d_parent->d_name.name,
846 dentry->d_name.name); 861 dentry->d_name.name);
847 return 0; 862 return 0;
863out_error:
864 nfs_free_fattr(fattr);
865 nfs_free_fhandle(fhandle);
866 dput(parent);
867 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n",
868 __func__, dentry->d_parent->d_name.name,
869 dentry->d_name.name, error);
870 return error;
848} 871}
849 872
850/* 873/*
@@ -909,9 +932,9 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
909 struct dentry *res; 932 struct dentry *res;
910 struct dentry *parent; 933 struct dentry *parent;
911 struct inode *inode = NULL; 934 struct inode *inode = NULL;
935 struct nfs_fh *fhandle = NULL;
936 struct nfs_fattr *fattr = NULL;
912 int error; 937 int error;
913 struct nfs_fh fhandle;
914 struct nfs_fattr fattr;
915 938
916 dfprintk(VFS, "NFS: lookup(%s/%s)\n", 939 dfprintk(VFS, "NFS: lookup(%s/%s)\n",
917 dentry->d_parent->d_name.name, dentry->d_name.name); 940 dentry->d_parent->d_name.name, dentry->d_name.name);
@@ -921,7 +944,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
921 if (dentry->d_name.len > NFS_SERVER(dir)->namelen) 944 if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
922 goto out; 945 goto out;
923 946
924 res = ERR_PTR(-ENOMEM);
925 dentry->d_op = NFS_PROTO(dir)->dentry_ops; 947 dentry->d_op = NFS_PROTO(dir)->dentry_ops;
926 948
927 /* 949 /*
@@ -934,17 +956,23 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
934 goto out; 956 goto out;
935 } 957 }
936 958
959 res = ERR_PTR(-ENOMEM);
960 fhandle = nfs_alloc_fhandle();
961 fattr = nfs_alloc_fattr();
962 if (fhandle == NULL || fattr == NULL)
963 goto out;
964
937 parent = dentry->d_parent; 965 parent = dentry->d_parent;
938 /* Protect against concurrent sillydeletes */ 966 /* Protect against concurrent sillydeletes */
939 nfs_block_sillyrename(parent); 967 nfs_block_sillyrename(parent);
940 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); 968 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
941 if (error == -ENOENT) 969 if (error == -ENOENT)
942 goto no_entry; 970 goto no_entry;
943 if (error < 0) { 971 if (error < 0) {
944 res = ERR_PTR(error); 972 res = ERR_PTR(error);
945 goto out_unblock_sillyrename; 973 goto out_unblock_sillyrename;
946 } 974 }
947 inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr); 975 inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
948 res = (struct dentry *)inode; 976 res = (struct dentry *)inode;
949 if (IS_ERR(res)) 977 if (IS_ERR(res))
950 goto out_unblock_sillyrename; 978 goto out_unblock_sillyrename;
@@ -960,6 +988,8 @@ no_entry:
960out_unblock_sillyrename: 988out_unblock_sillyrename:
961 nfs_unblock_sillyrename(parent); 989 nfs_unblock_sillyrename(parent);
962out: 990out:
991 nfs_free_fattr(fattr);
992 nfs_free_fhandle(fhandle);
963 return res; 993 return res;
964} 994}
965 995
@@ -1025,12 +1055,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1025 res = NULL; 1055 res = NULL;
1026 goto out; 1056 goto out;
1027 /* This turned out not to be a regular file */ 1057 /* This turned out not to be a regular file */
1058 case -EISDIR:
1028 case -ENOTDIR: 1059 case -ENOTDIR:
1029 goto no_open; 1060 goto no_open;
1030 case -ELOOP: 1061 case -ELOOP:
1031 if (!(nd->intent.open.flags & O_NOFOLLOW)) 1062 if (!(nd->intent.open.flags & O_NOFOLLOW))
1032 goto no_open; 1063 goto no_open;
1033 /* case -EISDIR: */
1034 /* case -EINVAL: */ 1064 /* case -EINVAL: */
1035 default: 1065 default:
1036 goto out; 1066 goto out;
@@ -1050,7 +1080,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1050 struct inode *dir; 1080 struct inode *dir;
1051 int openflags, ret = 0; 1081 int openflags, ret = 0;
1052 1082
1053 if (!is_atomic_open(nd)) 1083 if (!is_atomic_open(nd) || d_mountpoint(dentry))
1054 goto no_open; 1084 goto no_open;
1055 parent = dget_parent(dentry); 1085 parent = dget_parent(dentry);
1056 dir = parent->d_inode; 1086 dir = parent->d_inode;
@@ -1667,28 +1697,33 @@ static void nfs_access_free_entry(struct nfs_access_entry *entry)
1667 smp_mb__after_atomic_dec(); 1697 smp_mb__after_atomic_dec();
1668} 1698}
1669 1699
1700static void nfs_access_free_list(struct list_head *head)
1701{
1702 struct nfs_access_entry *cache;
1703
1704 while (!list_empty(head)) {
1705 cache = list_entry(head->next, struct nfs_access_entry, lru);
1706 list_del(&cache->lru);
1707 nfs_access_free_entry(cache);
1708 }
1709}
1710
1670int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask) 1711int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask)
1671{ 1712{
1672 LIST_HEAD(head); 1713 LIST_HEAD(head);
1673 struct nfs_inode *nfsi; 1714 struct nfs_inode *nfsi;
1674 struct nfs_access_entry *cache; 1715 struct nfs_access_entry *cache;
1675 1716
1676restart: 1717 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
1718 return (nr_to_scan == 0) ? 0 : -1;
1719
1677 spin_lock(&nfs_access_lru_lock); 1720 spin_lock(&nfs_access_lru_lock);
1678 list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) { 1721 list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) {
1679 struct rw_semaphore *s_umount;
1680 struct inode *inode; 1722 struct inode *inode;
1681 1723
1682 if (nr_to_scan-- == 0) 1724 if (nr_to_scan-- == 0)
1683 break; 1725 break;
1684 s_umount = &nfsi->vfs_inode.i_sb->s_umount; 1726 inode = &nfsi->vfs_inode;
1685 if (!down_read_trylock(s_umount))
1686 continue;
1687 inode = igrab(&nfsi->vfs_inode);
1688 if (inode == NULL) {
1689 up_read(s_umount);
1690 continue;
1691 }
1692 spin_lock(&inode->i_lock); 1727 spin_lock(&inode->i_lock);
1693 if (list_empty(&nfsi->access_cache_entry_lru)) 1728 if (list_empty(&nfsi->access_cache_entry_lru))
1694 goto remove_lru_entry; 1729 goto remove_lru_entry;
@@ -1702,61 +1737,47 @@ restart:
1702 else { 1737 else {
1703remove_lru_entry: 1738remove_lru_entry:
1704 list_del_init(&nfsi->access_cache_inode_lru); 1739 list_del_init(&nfsi->access_cache_inode_lru);
1740 smp_mb__before_clear_bit();
1705 clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); 1741 clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags);
1742 smp_mb__after_clear_bit();
1706 } 1743 }
1707 spin_unlock(&inode->i_lock);
1708 spin_unlock(&nfs_access_lru_lock);
1709 iput(inode);
1710 up_read(s_umount);
1711 goto restart;
1712 } 1744 }
1713 spin_unlock(&nfs_access_lru_lock); 1745 spin_unlock(&nfs_access_lru_lock);
1714 while (!list_empty(&head)) { 1746 nfs_access_free_list(&head);
1715 cache = list_entry(head.next, struct nfs_access_entry, lru);
1716 list_del(&cache->lru);
1717 nfs_access_free_entry(cache);
1718 }
1719 return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure; 1747 return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure;
1720} 1748}
1721 1749
1722static void __nfs_access_zap_cache(struct inode *inode) 1750static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head)
1723{ 1751{
1724 struct nfs_inode *nfsi = NFS_I(inode);
1725 struct rb_root *root_node = &nfsi->access_cache; 1752 struct rb_root *root_node = &nfsi->access_cache;
1726 struct rb_node *n, *dispose = NULL; 1753 struct rb_node *n;
1727 struct nfs_access_entry *entry; 1754 struct nfs_access_entry *entry;
1728 1755
1729 /* Unhook entries from the cache */ 1756 /* Unhook entries from the cache */
1730 while ((n = rb_first(root_node)) != NULL) { 1757 while ((n = rb_first(root_node)) != NULL) {
1731 entry = rb_entry(n, struct nfs_access_entry, rb_node); 1758 entry = rb_entry(n, struct nfs_access_entry, rb_node);
1732 rb_erase(n, root_node); 1759 rb_erase(n, root_node);
1733 list_del(&entry->lru); 1760 list_move(&entry->lru, head);
1734 n->rb_left = dispose;
1735 dispose = n;
1736 } 1761 }
1737 nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS; 1762 nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS;
1738 spin_unlock(&inode->i_lock);
1739
1740 /* Now kill them all! */
1741 while (dispose != NULL) {
1742 n = dispose;
1743 dispose = n->rb_left;
1744 nfs_access_free_entry(rb_entry(n, struct nfs_access_entry, rb_node));
1745 }
1746} 1763}
1747 1764
1748void nfs_access_zap_cache(struct inode *inode) 1765void nfs_access_zap_cache(struct inode *inode)
1749{ 1766{
1767 LIST_HEAD(head);
1768
1769 if (test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags) == 0)
1770 return;
1750 /* Remove from global LRU init */ 1771 /* Remove from global LRU init */
1751 if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { 1772 spin_lock(&nfs_access_lru_lock);
1752 spin_lock(&nfs_access_lru_lock); 1773 if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags))
1753 list_del_init(&NFS_I(inode)->access_cache_inode_lru); 1774 list_del_init(&NFS_I(inode)->access_cache_inode_lru);
1754 spin_unlock(&nfs_access_lru_lock);
1755 }
1756 1775
1757 spin_lock(&inode->i_lock); 1776 spin_lock(&inode->i_lock);
1758 /* This will release the spinlock */ 1777 __nfs_access_zap_cache(NFS_I(inode), &head);
1759 __nfs_access_zap_cache(inode); 1778 spin_unlock(&inode->i_lock);
1779 spin_unlock(&nfs_access_lru_lock);
1780 nfs_access_free_list(&head);
1760} 1781}
1761 1782
1762static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred) 1783static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred)
@@ -1789,7 +1810,7 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str
1789 cache = nfs_access_search_rbtree(inode, cred); 1810 cache = nfs_access_search_rbtree(inode, cred);
1790 if (cache == NULL) 1811 if (cache == NULL)
1791 goto out; 1812 goto out;
1792 if (!nfs_have_delegation(inode, FMODE_READ) && 1813 if (!nfs_have_delegated_attributes(inode) &&
1793 !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) 1814 !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
1794 goto out_stale; 1815 goto out_stale;
1795 res->jiffies = cache->jiffies; 1816 res->jiffies = cache->jiffies;
@@ -1807,8 +1828,8 @@ out_stale:
1807 nfs_access_free_entry(cache); 1828 nfs_access_free_entry(cache);
1808 return -ENOENT; 1829 return -ENOENT;
1809out_zap: 1830out_zap:
1810 /* This will release the spinlock */ 1831 spin_unlock(&inode->i_lock);
1811 __nfs_access_zap_cache(inode); 1832 nfs_access_zap_cache(inode);
1812 return -ENOENT; 1833 return -ENOENT;
1813} 1834}
1814 1835
@@ -1863,9 +1884,11 @@ static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *s
1863 smp_mb__after_atomic_inc(); 1884 smp_mb__after_atomic_inc();
1864 1885
1865 /* Add inode to global LRU list */ 1886 /* Add inode to global LRU list */
1866 if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { 1887 if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) {
1867 spin_lock(&nfs_access_lru_lock); 1888 spin_lock(&nfs_access_lru_lock);
1868 list_add_tail(&NFS_I(inode)->access_cache_inode_lru, &nfs_access_lru_list); 1889 if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags))
1890 list_add_tail(&NFS_I(inode)->access_cache_inode_lru,
1891 &nfs_access_lru_list);
1869 spin_unlock(&nfs_access_lru_lock); 1892 spin_unlock(&nfs_access_lru_lock);
1870 } 1893 }
1871} 1894}
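
The nfs_access_cache_shrinker() and nfs_access_zap_cache() rewrite above replaces the old drop-lock-and-restart loop (and its igrab/s_umount juggling) with a simpler discipline: unhook entries onto a private list while the locks are held, then free them with nfs_access_free_list() after the locks are dropped. A standalone sketch of that collect-then-free pattern, with a plain singly linked list and a pthread mutex standing in for the kernel list and spinlocks:

    /* Standalone sketch of "collect under lock, free outside the lock".
     * Names and the list type are stand-ins, not the kernel list API. */
    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct entry {
        struct entry *next;
        int id;
    };

    static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct entry *cache_head;

    /* Unhook everything while holding the lock; defer all freeing so the
     * critical section stays short and free() never runs under the lock. */
    static struct entry *cache_detach_all(void)
    {
        struct entry *head;

        pthread_mutex_lock(&cache_lock);
        head = cache_head;
        cache_head = NULL;
        pthread_mutex_unlock(&cache_lock);
        return head;
    }

    static void free_list(struct entry *head)
    {
        while (head != NULL) {
            struct entry *next = head->next;
            printf("freeing entry %d\n", head->id);
            free(head);
            head = next;
        }
    }

    int main(void)
    {
        for (int i = 0; i < 3; i++) {
            struct entry *e = malloc(sizeof(*e));
            e->id = i;
            e->next = cache_head;
            cache_head = e;     /* single-threaded setup, no lock needed */
        }
        free_list(cache_detach_all());
        return 0;
    }
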
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 0d289823e856..ad4cd31d6050 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -44,6 +44,7 @@
44#include <linux/file.h> 44#include <linux/file.h>
45#include <linux/pagemap.h> 45#include <linux/pagemap.h>
46#include <linux/kref.h> 46#include <linux/kref.h>
47#include <linux/slab.h>
47 48
48#include <linux/nfs_fs.h> 49#include <linux/nfs_fs.h>
49#include <linux/nfs_page.h> 50#include <linux/nfs_page.h>
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 3f0cd4dfddaf..76fd235d0024 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -9,6 +9,7 @@
9#include <linux/hash.h> 9#include <linux/hash.h>
10#include <linux/string.h> 10#include <linux/string.h>
11#include <linux/kmod.h> 11#include <linux/kmod.h>
12#include <linux/slab.h>
12#include <linux/module.h> 13#include <linux/module.h>
13#include <linux/socket.h> 14#include <linux/socket.h>
14#include <linux/seq_file.h> 15#include <linux/seq_file.h>
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index ae8d02294e46..cac96bcc91e4 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -24,9 +24,9 @@
24#include <linux/nfs_fs.h> 24#include <linux/nfs_fs.h>
25#include <linux/nfs_mount.h> 25#include <linux/nfs_mount.h>
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/slab.h>
28#include <linux/pagemap.h> 27#include <linux/pagemap.h>
29#include <linux/aio.h> 28#include <linux/aio.h>
29#include <linux/gfp.h>
30 30
31#include <asm/uaccess.h> 31#include <asm/uaccess.h>
32#include <asm/system.h> 32#include <asm/system.h>
@@ -161,14 +161,17 @@ static int nfs_revalidate_file_size(struct inode *inode, struct file *filp)
161 struct nfs_server *server = NFS_SERVER(inode); 161 struct nfs_server *server = NFS_SERVER(inode);
162 struct nfs_inode *nfsi = NFS_I(inode); 162 struct nfs_inode *nfsi = NFS_I(inode);
163 163
164 if (server->flags & NFS_MOUNT_NOAC) 164 if (nfs_have_delegated_attributes(inode))
165 goto force_reval; 165 goto out_noreval;
166
166 if (filp->f_flags & O_DIRECT) 167 if (filp->f_flags & O_DIRECT)
167 goto force_reval; 168 goto force_reval;
168 if (nfsi->npages != 0) 169 if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
169 return 0; 170 goto force_reval;
170 if (!(nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) && !nfs_attribute_timeout(inode)) 171 if (nfs_attribute_timeout(inode))
171 return 0; 172 goto force_reval;
173out_noreval:
174 return 0;
172force_reval: 175force_reval:
173 return __nfs_revalidate_inode(server, inode); 176 return __nfs_revalidate_inode(server, inode);
174} 177}
@@ -491,7 +494,8 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
491{ 494{
492 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); 495 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
493 496
494 if (gfp & __GFP_WAIT) 497 /* Only do I/O if gfp is a superset of GFP_KERNEL */
498 if ((gfp & GFP_KERNEL) == GFP_KERNEL)
495 nfs_wb_page(page->mapping->host, page); 499 nfs_wb_page(page->mapping->host, page);
496 /* If PagePrivate() is set, then the page is not freeable */ 500 /* If PagePrivate() is set, then the page is not freeable */
497 if (PagePrivate(page)) 501 if (PagePrivate(page))
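
nfs_release_page() above stops testing the single __GFP_WAIT bit and instead requires the caller's mask to be a superset of GFP_KERNEL before issuing writeback: a GFP_NOFS or GFP_NOIO caller may be allowed to sleep, yet must not re-enter the filesystem. A standalone sketch of the bitmask idiom; the flag values here are invented for illustration and do not match the kernel's:

    /* Standalone sketch of the "superset of GFP_KERNEL" test used above. */
    #include <stdio.h>

    #define __GFP_WAIT 0x01u
    #define __GFP_IO   0x02u
    #define __GFP_FS   0x04u
    #define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS)
    #define GFP_NOFS   (__GFP_WAIT | __GFP_IO)

    static int may_do_io(unsigned int gfp)
    {
        /* True only if every bit of GFP_KERNEL is set in gfp: the caller
         * permits sleeping, block I/O, and re-entering the filesystem. */
        return (gfp & GFP_KERNEL) == GFP_KERNEL;
    }

    int main(void)
    {
        printf("GFP_KERNEL: %d\n", may_do_io(GFP_KERNEL)); /* 1 */
        printf("GFP_NOFS:   %d\n", may_do_io(GFP_NOFS));   /* 0: no __GFP_FS */
        printf("__GFP_WAIT: %d\n", may_do_io(__GFP_WAIT)); /* 0 */
        return 0;
    }
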
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 237874f1af23..ce153a6b3aec 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -17,6 +17,7 @@
17#include <linux/nfs_fs_sb.h> 17#include <linux/nfs_fs_sb.h>
18#include <linux/in6.h> 18#include <linux/in6.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/slab.h>
20 21
21#include "internal.h" 22#include "internal.h"
22#include "iostat.h" 23#include "iostat.h"
@@ -466,7 +467,8 @@ int __nfs_readpages_from_fscache(struct nfs_open_context *ctx,
466 struct list_head *pages, 467 struct list_head *pages,
467 unsigned *nr_pages) 468 unsigned *nr_pages)
468{ 469{
469 int ret, npages = *nr_pages; 470 unsigned npages = *nr_pages;
471 int ret;
470 472
471 dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n", 473 dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n",
472 NFS_I(inode)->fscache, npages, inode); 474 NFS_I(inode)->fscache, npages, inode);
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b35d2a616066..7428f7d6273b 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -78,159 +78,94 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
78{ 78{
79 struct nfs_server *server = NFS_SB(sb); 79 struct nfs_server *server = NFS_SB(sb);
80 struct nfs_fsinfo fsinfo; 80 struct nfs_fsinfo fsinfo;
81 struct nfs_fattr fattr; 81 struct dentry *ret;
82 struct dentry *mntroot;
83 struct inode *inode; 82 struct inode *inode;
84 int error; 83 int error;
85 84
86 /* get the actual root for this mount */ 85 /* get the actual root for this mount */
87 fsinfo.fattr = &fattr; 86 fsinfo.fattr = nfs_alloc_fattr();
87 if (fsinfo.fattr == NULL)
88 return ERR_PTR(-ENOMEM);
88 89
89 error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); 90 error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
90 if (error < 0) { 91 if (error < 0) {
91 dprintk("nfs_get_root: getattr error = %d\n", -error); 92 dprintk("nfs_get_root: getattr error = %d\n", -error);
92 return ERR_PTR(error); 93 ret = ERR_PTR(error);
94 goto out;
93 } 95 }
94 96
95 inode = nfs_fhget(sb, mntfh, fsinfo.fattr); 97 inode = nfs_fhget(sb, mntfh, fsinfo.fattr);
96 if (IS_ERR(inode)) { 98 if (IS_ERR(inode)) {
97 dprintk("nfs_get_root: get root inode failed\n"); 99 dprintk("nfs_get_root: get root inode failed\n");
98 return ERR_CAST(inode); 100 ret = ERR_CAST(inode);
101 goto out;
99 } 102 }
100 103
101 error = nfs_superblock_set_dummy_root(sb, inode); 104 error = nfs_superblock_set_dummy_root(sb, inode);
102 if (error != 0) 105 if (error != 0) {
103 return ERR_PTR(error); 106 ret = ERR_PTR(error);
107 goto out;
108 }
104 109
105 /* root dentries normally start off anonymous and get spliced in later 110 /* root dentries normally start off anonymous and get spliced in later
106 * if the dentry tree reaches them; however if the dentry already 111 * if the dentry tree reaches them; however if the dentry already
107 * exists, we'll pick it up at this point and use it as the root 112 * exists, we'll pick it up at this point and use it as the root
108 */ 113 */
109 mntroot = d_obtain_alias(inode); 114 ret = d_obtain_alias(inode);
110 if (IS_ERR(mntroot)) { 115 if (IS_ERR(ret)) {
111 dprintk("nfs_get_root: get root dentry failed\n"); 116 dprintk("nfs_get_root: get root dentry failed\n");
112 return mntroot; 117 goto out;
113 } 118 }
114 119
115 security_d_instantiate(mntroot, inode); 120 security_d_instantiate(ret, inode);
116
117 if (!mntroot->d_op)
118 mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops;
119 121
120 return mntroot; 122 if (ret->d_op == NULL)
123 ret->d_op = server->nfs_client->rpc_ops->dentry_ops;
124out:
125 nfs_free_fattr(fsinfo.fattr);
126 return ret;
121} 127}
122 128
123#ifdef CONFIG_NFS_V4 129#ifdef CONFIG_NFS_V4
124 130
125/* 131int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh)
126 * Do a simple pathwalk from the root FH of the server to the nominated target
127 * of the mountpoint
128 * - give error on symlinks
129 * - give error on ".." occurring in the path
130 * - follow traversals
131 */
132int nfs4_path_walk(struct nfs_server *server,
133 struct nfs_fh *mntfh,
134 const char *path)
135{ 132{
136 struct nfs_fsinfo fsinfo; 133 struct nfs_fsinfo fsinfo;
137 struct nfs_fattr fattr; 134 int ret = -ENOMEM;
138 struct nfs_fh lastfh;
139 struct qstr name;
140 int ret;
141 135
142 dprintk("--> nfs4_path_walk(,,%s)\n", path); 136 dprintk("--> nfs4_get_rootfh()\n");
143 137
144 fsinfo.fattr = &fattr; 138 fsinfo.fattr = nfs_alloc_fattr();
145 nfs_fattr_init(&fattr); 139 if (fsinfo.fattr == NULL)
146 140 goto out;
147 /* Eat leading slashes */
148 while (*path == '/')
149 path++;
150 141
151 /* Start by getting the root filehandle from the server */ 142 /* Start by getting the root filehandle from the server */
152 ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); 143 ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
153 if (ret < 0) { 144 if (ret < 0) {
154 dprintk("nfs4_get_root: getroot error = %d\n", -ret); 145 dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret);
155 return ret; 146 goto out;
156 } 147 }
157 148
158 if (!S_ISDIR(fattr.mode)) { 149 if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_MODE)
159 printk(KERN_ERR "nfs4_get_root:" 150 || !S_ISDIR(fsinfo.fattr->mode)) {
151 printk(KERN_ERR "nfs4_get_rootfh:"
160 " getroot encountered non-directory\n"); 152 " getroot encountered non-directory\n");
161 return -ENOTDIR; 153 ret = -ENOTDIR;
154 goto out;
162 } 155 }
163 156
164 /* FIXME: It is quite valid for the server to return a referral here */ 157 if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
165 if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) { 158 printk(KERN_ERR "nfs4_get_rootfh:"
166 printk(KERN_ERR "nfs4_get_root:"
167 " getroot obtained referral\n"); 159 " getroot obtained referral\n");
168 return -EREMOTE; 160 ret = -EREMOTE;
169 } 161 goto out;
170
171next_component:
172 dprintk("Next: %s\n", path);
173
174 /* extract the next bit of the path */
175 if (!*path)
176 goto path_walk_complete;
177
178 name.name = path;
179 while (*path && *path != '/')
180 path++;
181 name.len = path - (const char *) name.name;
182
183 if (name.len > NFS4_MAXNAMLEN)
184 return -ENAMETOOLONG;
185
186eat_dot_dir:
187 while (*path == '/')
188 path++;
189
190 if (path[0] == '.' && (path[1] == '/' || !path[1])) {
191 path += 2;
192 goto eat_dot_dir;
193 }
194
195 /* FIXME: Why shouldn't the user be able to use ".." in the path? */
196 if (path[0] == '.' && path[1] == '.' && (path[2] == '/' || !path[2])
197 ) {
198 printk(KERN_ERR "nfs4_get_root:"
199 " Mount path contains reference to \"..\"\n");
200 return -EINVAL;
201 } 162 }
202 163
203 /* lookup the next FH in the sequence */ 164 memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid));
204 memcpy(&lastfh, mntfh, sizeof(lastfh)); 165out:
205 166 nfs_free_fattr(fsinfo.fattr);
206 dprintk("LookupFH: %*.*s [%s]\n", name.len, name.len, name.name, path); 167 dprintk("<-- nfs4_get_rootfh() = %d\n", ret);
207 168 return ret;
208 ret = server->nfs_client->rpc_ops->lookupfh(server, &lastfh, &name,
209 mntfh, &fattr);
210 if (ret < 0) {
211 dprintk("nfs4_get_root: getroot error = %d\n", -ret);
212 return ret;
213 }
214
215 if (!S_ISDIR(fattr.mode)) {
216 printk(KERN_ERR "nfs4_get_root:"
217 " lookupfh encountered non-directory\n");
218 return -ENOTDIR;
219 }
220
221 /* FIXME: Referrals are quite valid here too */
222 if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) {
223 printk(KERN_ERR "nfs4_get_root:"
224 " lookupfh obtained referral\n");
225 return -EREMOTE;
226 }
227
228 goto next_component;
229
230path_walk_complete:
231 memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid));
232 dprintk("<-- nfs4_path_walk() = 0\n");
233 return 0;
234} 169}
235 170
236/* 171/*
@@ -239,8 +174,8 @@ path_walk_complete:
239struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh) 174struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
240{ 175{
241 struct nfs_server *server = NFS_SB(sb); 176 struct nfs_server *server = NFS_SB(sb);
242 struct nfs_fattr fattr; 177 struct nfs_fattr *fattr = NULL;
243 struct dentry *mntroot; 178 struct dentry *ret;
244 struct inode *inode; 179 struct inode *inode;
245 int error; 180 int error;
246 181
@@ -254,40 +189,50 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
254 return ERR_PTR(error); 189 return ERR_PTR(error);
255 } 190 }
256 191
192 fattr = nfs_alloc_fattr();
193 if (fattr == NULL)
194 return ERR_PTR(-ENOMEM);;
 194 return ERR_PTR(-ENOMEM);
195
257 /* get the actual root for this mount */ 196 /* get the actual root for this mount */
258 error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr); 197 error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr);
259 if (error < 0) { 198 if (error < 0) {
260 dprintk("nfs_get_root: getattr error = %d\n", -error); 199 dprintk("nfs_get_root: getattr error = %d\n", -error);
261 return ERR_PTR(error); 200 ret = ERR_PTR(error);
201 goto out;
262 } 202 }
263 203
264 inode = nfs_fhget(sb, mntfh, &fattr); 204 inode = nfs_fhget(sb, mntfh, fattr);
265 if (IS_ERR(inode)) { 205 if (IS_ERR(inode)) {
266 dprintk("nfs_get_root: get root inode failed\n"); 206 dprintk("nfs_get_root: get root inode failed\n");
267 return ERR_CAST(inode); 207 ret = ERR_CAST(inode);
208 goto out;
268 } 209 }
269 210
270 error = nfs_superblock_set_dummy_root(sb, inode); 211 error = nfs_superblock_set_dummy_root(sb, inode);
271 if (error != 0) 212 if (error != 0) {
272 return ERR_PTR(error); 213 ret = ERR_PTR(error);
214 goto out;
215 }
273 216
274 /* root dentries normally start off anonymous and get spliced in later 217 /* root dentries normally start off anonymous and get spliced in later
275 * if the dentry tree reaches them; however if the dentry already 218 * if the dentry tree reaches them; however if the dentry already
276 * exists, we'll pick it up at this point and use it as the root 219 * exists, we'll pick it up at this point and use it as the root
277 */ 220 */
278 mntroot = d_obtain_alias(inode); 221 ret = d_obtain_alias(inode);
279 if (IS_ERR(mntroot)) { 222 if (IS_ERR(ret)) {
280 dprintk("nfs_get_root: get root dentry failed\n"); 223 dprintk("nfs_get_root: get root dentry failed\n");
281 return mntroot; 224 goto out;
282 } 225 }
283 226
284 security_d_instantiate(mntroot, inode); 227 security_d_instantiate(ret, inode);
285 228
286 if (!mntroot->d_op) 229 if (ret->d_op == NULL)
287 mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops; 230 ret->d_op = server->nfs_client->rpc_ops->dentry_ops;
288 231
232out:
233 nfs_free_fattr(fattr);
289 dprintk("<-- nfs4_get_root()\n"); 234 dprintk("<-- nfs4_get_root()\n");
290 return mntroot; 235 return ret;
291} 236}
292 237
293#endif /* CONFIG_NFS_V4 */ 238#endif /* CONFIG_NFS_V4 */
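
Besides deleting the manual pathwalk, nfs4_get_rootfh() above adds a check the old code skipped: fattr->mode is only meaningful if the server actually returned it, so the NFS_ATTR_FATTR_MODE validity bit must be tested before the S_ISDIR() check. A standalone sketch of validity-bit gating; the flag values and types are stand-ins:

    /* Standalone sketch of attribute-validity gating, not the NFS code. */
    #include <stdio.h>
    #include <errno.h>

    #define ATTR_MODE     0x1u
    #define ATTR_REFERRAL 0x2u
    #define S_IFDIR_BIT   0x4000u

    struct fattr {
        unsigned int valid;     /* which fields below are meaningful */
        unsigned int mode;
    };

    static int check_root(const struct fattr *f)
    {
        if (!(f->valid & ATTR_MODE) || !(f->mode & S_IFDIR_BIT))
            return -ENOTDIR;    /* mode missing, or not a directory */
        if (f->valid & ATTR_REFERRAL)
            return -EREMOTE;    /* the root is really a referral */
        return 0;
    }

    int main(void)
    {
        struct fattr dir  = { .valid = ATTR_MODE, .mode = S_IFDIR_BIT };
        struct fattr bare = { .valid = 0, .mode = S_IFDIR_BIT };

        printf("dir:  %d\n", check_root(&dir));   /* 0 */
        printf("bare: %d\n", check_root(&bare));  /* -ENOTDIR: unverified */
        return 0;
    }
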
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 657201acda84..099b3518feea 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -36,6 +36,7 @@
36#include <linux/vfs.h> 36#include <linux/vfs.h>
37#include <linux/inet.h> 37#include <linux/inet.h>
38#include <linux/nfs_xdr.h> 38#include <linux/nfs_xdr.h>
39#include <linux/slab.h>
39 40
40#include <asm/system.h> 41#include <asm/system.h>
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
@@ -392,8 +393,8 @@ int
392nfs_setattr(struct dentry *dentry, struct iattr *attr) 393nfs_setattr(struct dentry *dentry, struct iattr *attr)
393{ 394{
394 struct inode *inode = dentry->d_inode; 395 struct inode *inode = dentry->d_inode;
395 struct nfs_fattr fattr; 396 struct nfs_fattr *fattr;
396 int error; 397 int error = -ENOMEM;
397 398
398 nfs_inc_stats(inode, NFSIOS_VFSSETATTR); 399 nfs_inc_stats(inode, NFSIOS_VFSSETATTR);
399 400
@@ -416,14 +417,20 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
416 filemap_write_and_wait(inode->i_mapping); 417 filemap_write_and_wait(inode->i_mapping);
417 nfs_wb_all(inode); 418 nfs_wb_all(inode);
418 } 419 }
420
421 fattr = nfs_alloc_fattr();
422 if (fattr == NULL)
423 goto out;
419 /* 424 /*
420 * Return any delegations if we're going to change ACLs 425 * Return any delegations if we're going to change ACLs
421 */ 426 */
422 if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) 427 if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0)
423 nfs_inode_return_delegation(inode); 428 nfs_inode_return_delegation(inode);
424 error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr); 429 error = NFS_PROTO(inode)->setattr(dentry, fattr, attr);
425 if (error == 0) 430 if (error == 0)
426 nfs_refresh_inode(inode, &fattr); 431 nfs_refresh_inode(inode, fattr);
432 nfs_free_fattr(fattr);
433out:
427 return error; 434 return error;
428} 435}
429 436
@@ -622,10 +629,10 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
622 list_for_each_entry(pos, &nfsi->open_files, list) { 629 list_for_each_entry(pos, &nfsi->open_files, list) {
623 if (cred != NULL && pos->cred != cred) 630 if (cred != NULL && pos->cred != cred)
624 continue; 631 continue;
625 if ((pos->mode & mode) == mode) { 632 if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode)
626 ctx = get_nfs_open_context(pos); 633 continue;
627 break; 634 ctx = get_nfs_open_context(pos);
628 } 635 break;
629 } 636 }
630 spin_unlock(&inode->i_lock); 637 spin_unlock(&inode->i_lock);
631 return ctx; 638 return ctx;
@@ -681,7 +688,7 @@ int
681__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) 688__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
682{ 689{
683 int status = -ESTALE; 690 int status = -ESTALE;
684 struct nfs_fattr fattr; 691 struct nfs_fattr *fattr = NULL;
685 struct nfs_inode *nfsi = NFS_I(inode); 692 struct nfs_inode *nfsi = NFS_I(inode);
686 693
687 dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", 694 dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n",
@@ -692,8 +699,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
692 if (NFS_STALE(inode)) 699 if (NFS_STALE(inode))
693 goto out; 700 goto out;
694 701
702 status = -ENOMEM;
703 fattr = nfs_alloc_fattr();
704 if (fattr == NULL)
705 goto out;
706
695 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); 707 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
696 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); 708 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr);
697 if (status != 0) { 709 if (status != 0) {
698 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", 710 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n",
699 inode->i_sb->s_id, 711 inode->i_sb->s_id,
@@ -706,7 +718,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
706 goto out; 718 goto out;
707 } 719 }
708 720
709 status = nfs_refresh_inode(inode, &fattr); 721 status = nfs_refresh_inode(inode, fattr);
710 if (status) { 722 if (status) {
711 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", 723 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
712 inode->i_sb->s_id, 724 inode->i_sb->s_id,
@@ -722,6 +734,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
722 (long long)NFS_FILEID(inode)); 734 (long long)NFS_FILEID(inode));
723 735
724 out: 736 out:
737 nfs_free_fattr(fattr);
725 return status; 738 return status;
726} 739}
727 740
@@ -729,11 +742,16 @@ int nfs_attribute_timeout(struct inode *inode)
729{ 742{
730 struct nfs_inode *nfsi = NFS_I(inode); 743 struct nfs_inode *nfsi = NFS_I(inode);
731 744
732 if (nfs_have_delegation(inode, FMODE_READ))
733 return 0;
734 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); 745 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
735} 746}
736 747
748static int nfs_attribute_cache_expired(struct inode *inode)
749{
750 if (nfs_have_delegated_attributes(inode))
751 return 0;
752 return nfs_attribute_timeout(inode);
753}
754
737/** 755/**
738 * nfs_revalidate_inode - Revalidate the inode attributes 756 * nfs_revalidate_inode - Revalidate the inode attributes
739 * @server - pointer to nfs_server struct 757 * @server - pointer to nfs_server struct
@@ -744,7 +762,7 @@ int nfs_attribute_timeout(struct inode *inode)
744int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) 762int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
745{ 763{
746 if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR) 764 if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR)
747 && !nfs_attribute_timeout(inode)) 765 && !nfs_attribute_cache_expired(inode))
748 return NFS_STALE(inode) ? -ESTALE : 0; 766 return NFS_STALE(inode) ? -ESTALE : 0;
749 return __nfs_revalidate_inode(server, inode); 767 return __nfs_revalidate_inode(server, inode);
750} 768}
@@ -781,7 +799,8 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
781 int ret = 0; 799 int ret = 0;
782 800
783 if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) 801 if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
784 || nfs_attribute_timeout(inode) || NFS_STALE(inode)) { 802 || nfs_attribute_cache_expired(inode)
803 || NFS_STALE(inode)) {
785 ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); 804 ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
786 if (ret < 0) 805 if (ret < 0)
787 goto out; 806 goto out;
@@ -915,6 +934,26 @@ void nfs_fattr_init(struct nfs_fattr *fattr)
915 fattr->gencount = nfs_inc_attr_generation_counter(); 934 fattr->gencount = nfs_inc_attr_generation_counter();
916} 935}
917 936
937struct nfs_fattr *nfs_alloc_fattr(void)
938{
939 struct nfs_fattr *fattr;
940
941 fattr = kmalloc(sizeof(*fattr), GFP_NOFS);
942 if (fattr != NULL)
943 nfs_fattr_init(fattr);
944 return fattr;
945}
946
947struct nfs_fh *nfs_alloc_fhandle(void)
948{
949 struct nfs_fh *fh;
950
951 fh = kmalloc(sizeof(struct nfs_fh), GFP_NOFS);
952 if (fh != NULL)
953 fh->size = 0;
954 return fh;
955}
956
918/** 957/**
919 * nfs_inode_attrs_need_update - check if the inode attributes need updating 958 * nfs_inode_attrs_need_update - check if the inode attributes need updating
920 * @inode - pointer to inode 959 * @inode - pointer to inode
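
The fs/nfs/inode.c hunks above introduce the nfs_alloc_fattr()/nfs_alloc_fhandle() allocators (GFP_NOFS, pre-initialised) and split revalidation in two: nfs_attribute_timeout() now reports only the raw timer, while nfs_attribute_cache_expired() lets a read delegation suppress the timeout unless NFS_INO_REVAL_FORCED is set. A standalone sketch of that predicate layering; the types and flag are illustrative stand-ins:

    /* Standalone sketch of the layered revalidation predicate above. */
    #include <stdio.h>
    #include <stdbool.h>

    #define REVAL_FORCED 0x1u

    struct inode_state {
        bool has_read_delegation;
        unsigned int cache_validity;
        unsigned long jiffies_now, read_cache_jiffies, attrtimeo;
    };

    static bool attribute_timeout(const struct inode_state *s)
    {
        /* expired if "now" lies outside [read_cache_jiffies, +attrtimeo] */
        return !(s->jiffies_now - s->read_cache_jiffies <= s->attrtimeo);
    }

    static bool have_delegated_attributes(const struct inode_state *s)
    {
        return s->has_read_delegation &&
               !(s->cache_validity & REVAL_FORCED);
    }

    static bool attribute_cache_expired(const struct inode_state *s)
    {
        if (have_delegated_attributes(s))
            return false;       /* delegation keeps the cache authoritative */
        return attribute_timeout(s);
    }

    int main(void)
    {
        struct inode_state s = {
            .has_read_delegation = true,
            .jiffies_now = 1000, .read_cache_jiffies = 0, .attrtimeo = 100,
        };
        printf("delegated: expired=%d\n", attribute_cache_expired(&s)); /* 0 */
        s.cache_validity |= REVAL_FORCED;
        printf("forced:    expired=%d\n", attribute_cache_expired(&s)); /* 1 */
        return 0;
    }
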
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 11f82f03c5de..d8bd619e386c 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -244,9 +244,7 @@ extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *);
244#ifdef CONFIG_NFS_V4 244#ifdef CONFIG_NFS_V4
245extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *); 245extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *);
246 246
247extern int nfs4_path_walk(struct nfs_server *server, 247extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
248 struct nfs_fh *mntfh,
249 const char *path);
250#endif 248#endif
251 249
252/* read.c */ 250/* read.c */
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index 1d8d5c813b01..c5832487c456 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -36,14 +36,14 @@ static inline void nfs_inc_stats(const struct inode *inode,
36 36
37static inline void nfs_add_server_stats(const struct nfs_server *server, 37static inline void nfs_add_server_stats(const struct nfs_server *server,
38 enum nfs_stat_bytecounters stat, 38 enum nfs_stat_bytecounters stat,
39 unsigned long addend) 39 long addend)
40{ 40{
41 this_cpu_add(server->io_stats->bytes[stat], addend); 41 this_cpu_add(server->io_stats->bytes[stat], addend);
42} 42}
43 43
44static inline void nfs_add_stats(const struct inode *inode, 44static inline void nfs_add_stats(const struct inode *inode,
45 enum nfs_stat_bytecounters stat, 45 enum nfs_stat_bytecounters stat,
46 unsigned long addend) 46 long addend)
47{ 47{
48 nfs_add_server_stats(NFS_SERVER(inode), stat, addend); 48 nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
49} 49}
@@ -51,7 +51,7 @@ static inline void nfs_add_stats(const struct inode *inode,
51#ifdef CONFIG_NFS_FSCACHE 51#ifdef CONFIG_NFS_FSCACHE
52static inline void nfs_add_fscache_stats(struct inode *inode, 52static inline void nfs_add_fscache_stats(struct inode *inode,
53 enum nfs_stat_fscachecounters stat, 53 enum nfs_stat_fscachecounters stat,
54 unsigned long addend) 54 long addend)
55{ 55{
56 this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend); 56 this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend);
57} 57}
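
The fs/nfs/iostat.h change above is small but deliberate: the addend parameters become signed long so callers can pass negative deltas (for example, backing bytes out after a failed transfer) through the same per-cpu accounting helpers. A standalone sketch of why the signedness matters:

    /* Standalone sketch of signed stat deltas; illustrative only. */
    #include <stdio.h>

    static long long bytes_read;

    static void add_stats(long addend)
    {
        bytes_read += addend;   /* signed: works for credits and debits */
    }

    int main(void)
    {
        add_stats(4096);
        add_stats(-512);        /* an unsigned long parameter would turn
                                 * this into a huge positive value */
        printf("bytes_read = %lld\n", bytes_read);  /* 3584 */
        return 0;
    }
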
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 40c766782891..db6aa3673cf3 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -8,6 +8,7 @@
8 */ 8 */
9 9
10#include <linux/dcache.h> 10#include <linux/dcache.h>
11#include <linux/gfp.h>
11#include <linux/mount.h> 12#include <linux/mount.h>
12#include <linux/namei.h> 13#include <linux/namei.h>
13#include <linux/nfs_fs.h> 14#include <linux/nfs_fs.h>
@@ -104,8 +105,8 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
104 struct vfsmount *mnt; 105 struct vfsmount *mnt;
105 struct nfs_server *server = NFS_SERVER(dentry->d_inode); 106 struct nfs_server *server = NFS_SERVER(dentry->d_inode);
106 struct dentry *parent; 107 struct dentry *parent;
107 struct nfs_fh fh; 108 struct nfs_fh *fh = NULL;
108 struct nfs_fattr fattr; 109 struct nfs_fattr *fattr = NULL;
109 int err; 110 int err;
110 111
111 dprintk("--> nfs_follow_mountpoint()\n"); 112 dprintk("--> nfs_follow_mountpoint()\n");
@@ -114,6 +115,12 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
114 if (IS_ROOT(dentry)) 115 if (IS_ROOT(dentry))
115 goto out_err; 116 goto out_err;
116 117
118 err = -ENOMEM;
119 fh = nfs_alloc_fhandle();
120 fattr = nfs_alloc_fattr();
121 if (fh == NULL || fattr == NULL)
122 goto out_err;
123
117 dprintk("%s: enter\n", __func__); 124 dprintk("%s: enter\n", __func__);
118 dput(nd->path.dentry); 125 dput(nd->path.dentry);
119 nd->path.dentry = dget(dentry); 126 nd->path.dentry = dget(dentry);
@@ -122,16 +129,16 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
122 parent = dget_parent(nd->path.dentry); 129 parent = dget_parent(nd->path.dentry);
123 err = server->nfs_client->rpc_ops->lookup(parent->d_inode, 130 err = server->nfs_client->rpc_ops->lookup(parent->d_inode,
124 &nd->path.dentry->d_name, 131 &nd->path.dentry->d_name,
125 &fh, &fattr); 132 fh, fattr);
126 dput(parent); 133 dput(parent);
127 if (err != 0) 134 if (err != 0)
128 goto out_err; 135 goto out_err;
129 136
130 if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) 137 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
131 mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry); 138 mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry);
132 else 139 else
133 mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, &fh, 140 mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, fh,
134 &fattr); 141 fattr);
135 err = PTR_ERR(mnt); 142 err = PTR_ERR(mnt);
136 if (IS_ERR(mnt)) 143 if (IS_ERR(mnt))
137 goto out_err; 144 goto out_err;
@@ -150,6 +157,8 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
150 nd->path.dentry = dget(mnt->mnt_root); 157 nd->path.dentry = dget(mnt->mnt_root);
151 schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); 158 schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
152out: 159out:
160 nfs_free_fattr(fattr);
161 nfs_free_fhandle(fh);
153 dprintk("%s: done, returned %d\n", __func__, err); 162 dprintk("%s: done, returned %d\n", __func__, err);
154 163
155 dprintk("<-- nfs_follow_mountpoint() = %d\n", err); 164 dprintk("<-- nfs_follow_mountpoint() = %d\n", err);
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 7bc2da8efd4a..81cf14257916 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -12,7 +12,6 @@
12#include <linux/param.h> 12#include <linux/param.h>
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/slab.h>
16#include <linux/errno.h> 15#include <linux/errno.h>
17#include <linux/string.h> 16#include <linux/string.h>
18#include <linux/in.h> 17#include <linux/in.h>
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index bac60515a4b3..9f88c5f4c7e2 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -1,4 +1,5 @@
1#include <linux/fs.h> 1#include <linux/fs.h>
2#include <linux/gfp.h>
2#include <linux/nfs.h> 3#include <linux/nfs.h>
3#include <linux/nfs3.h> 4#include <linux/nfs3.h>
4#include <linux/nfs_fs.h> 5#include <linux/nfs_fs.h>
@@ -184,7 +185,6 @@ static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl,
184struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) 185struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
185{ 186{
186 struct nfs_server *server = NFS_SERVER(inode); 187 struct nfs_server *server = NFS_SERVER(inode);
187 struct nfs_fattr fattr;
188 struct page *pages[NFSACL_MAXPAGES] = { }; 188 struct page *pages[NFSACL_MAXPAGES] = { };
189 struct nfs3_getaclargs args = { 189 struct nfs3_getaclargs args = {
190 .fh = NFS_FH(inode), 190 .fh = NFS_FH(inode),
@@ -192,7 +192,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
192 .pages = pages, 192 .pages = pages,
193 }; 193 };
194 struct nfs3_getaclres res = { 194 struct nfs3_getaclres res = {
195 .fattr = &fattr, 195 0
196 }; 196 };
197 struct rpc_message msg = { 197 struct rpc_message msg = {
198 .rpc_argp = &args, 198 .rpc_argp = &args,
@@ -227,7 +227,10 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
227 227
228 dprintk("NFS call getacl\n"); 228 dprintk("NFS call getacl\n");
229 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL]; 229 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL];
230 nfs_fattr_init(&fattr); 230 res.fattr = nfs_alloc_fattr();
231 if (res.fattr == NULL)
232 return ERR_PTR(-ENOMEM);
233
231 status = rpc_call_sync(server->client_acl, &msg, 0); 234 status = rpc_call_sync(server->client_acl, &msg, 0);
232 dprintk("NFS reply getacl: %d\n", status); 235 dprintk("NFS reply getacl: %d\n", status);
233 236
@@ -237,7 +240,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
237 240
238 switch (status) { 241 switch (status) {
239 case 0: 242 case 0:
240 status = nfs_refresh_inode(inode, &fattr); 243 status = nfs_refresh_inode(inode, res.fattr);
241 break; 244 break;
242 case -EPFNOSUPPORT: 245 case -EPFNOSUPPORT:
243 case -EPROTONOSUPPORT: 246 case -EPROTONOSUPPORT:
@@ -277,6 +280,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
277getout: 280getout:
278 posix_acl_release(res.acl_access); 281 posix_acl_release(res.acl_access);
279 posix_acl_release(res.acl_default); 282 posix_acl_release(res.acl_default);
283 nfs_free_fattr(res.fattr);
280 284
281 if (status != 0) { 285 if (status != 0) {
282 posix_acl_release(acl); 286 posix_acl_release(acl);
@@ -289,7 +293,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
289 struct posix_acl *dfacl) 293 struct posix_acl *dfacl)
290{ 294{
291 struct nfs_server *server = NFS_SERVER(inode); 295 struct nfs_server *server = NFS_SERVER(inode);
292 struct nfs_fattr fattr; 296 struct nfs_fattr *fattr;
293 struct page *pages[NFSACL_MAXPAGES]; 297 struct page *pages[NFSACL_MAXPAGES];
294 struct nfs3_setaclargs args = { 298 struct nfs3_setaclargs args = {
295 .inode = inode, 299 .inode = inode,
@@ -334,8 +338,13 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
334 } 338 }
335 339
336 dprintk("NFS call setacl\n"); 340 dprintk("NFS call setacl\n");
341 status = -ENOMEM;
342 fattr = nfs_alloc_fattr();
343 if (fattr == NULL)
344 goto out_freepages;
345
337 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; 346 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL];
338 nfs_fattr_init(&fattr); 347 msg.rpc_resp = fattr;
339 status = rpc_call_sync(server->client_acl, &msg, 0); 348 status = rpc_call_sync(server->client_acl, &msg, 0);
340 nfs_access_zap_cache(inode); 349 nfs_access_zap_cache(inode);
341 nfs_zap_acl_cache(inode); 350 nfs_zap_acl_cache(inode);
@@ -343,7 +352,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
343 352
344 switch (status) { 353 switch (status) {
345 case 0: 354 case 0:
346 status = nfs_refresh_inode(inode, &fattr); 355 status = nfs_refresh_inode(inode, fattr);
347 nfs3_cache_acls(inode, acl, dfacl); 356 nfs3_cache_acls(inode, acl, dfacl);
348 break; 357 break;
349 case -EPFNOSUPPORT: 358 case -EPFNOSUPPORT:
@@ -354,6 +363,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
354 case -ENOTSUPP: 363 case -ENOTSUPP:
355 status = -EOPNOTSUPP; 364 status = -EOPNOTSUPP;
356 } 365 }
366 nfs_free_fattr(fattr);
357out_freepages: 367out_freepages:
358 while (args.npages != 0) { 368 while (args.npages != 0) {
359 args.npages--; 369 args.npages--;
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 24992f0a29f2..fabb4f2849a1 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -10,6 +10,7 @@
10#include <linux/errno.h> 10#include <linux/errno.h>
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/sunrpc/clnt.h> 12#include <linux/sunrpc/clnt.h>
13#include <linux/slab.h>
13#include <linux/nfs.h> 14#include <linux/nfs.h>
14#include <linux/nfs3.h> 15#include <linux/nfs3.h>
15#include <linux/nfs_fs.h> 16#include <linux/nfs_fs.h>
@@ -143,14 +144,12 @@ static int
143nfs3_proc_lookup(struct inode *dir, struct qstr *name, 144nfs3_proc_lookup(struct inode *dir, struct qstr *name,
144 struct nfs_fh *fhandle, struct nfs_fattr *fattr) 145 struct nfs_fh *fhandle, struct nfs_fattr *fattr)
145{ 146{
146 struct nfs_fattr dir_attr;
147 struct nfs3_diropargs arg = { 147 struct nfs3_diropargs arg = {
148 .fh = NFS_FH(dir), 148 .fh = NFS_FH(dir),
149 .name = name->name, 149 .name = name->name,
150 .len = name->len 150 .len = name->len
151 }; 151 };
152 struct nfs3_diropres res = { 152 struct nfs3_diropres res = {
153 .dir_attr = &dir_attr,
154 .fh = fhandle, 153 .fh = fhandle,
155 .fattr = fattr 154 .fattr = fattr
156 }; 155 };
@@ -162,29 +161,30 @@ nfs3_proc_lookup(struct inode *dir, struct qstr *name,
162 int status; 161 int status;
163 162
164 dprintk("NFS call lookup %s\n", name->name); 163 dprintk("NFS call lookup %s\n", name->name);
165 nfs_fattr_init(&dir_attr); 164 res.dir_attr = nfs_alloc_fattr();
165 if (res.dir_attr == NULL)
166 return -ENOMEM;
167
166 nfs_fattr_init(fattr); 168 nfs_fattr_init(fattr);
167 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 169 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
168 nfs_refresh_inode(dir, &dir_attr); 170 nfs_refresh_inode(dir, res.dir_attr);
169 if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) { 171 if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) {
170 msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR]; 172 msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR];
171 msg.rpc_argp = fhandle; 173 msg.rpc_argp = fhandle;
172 msg.rpc_resp = fattr; 174 msg.rpc_resp = fattr;
173 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 175 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
174 } 176 }
177 nfs_free_fattr(res.dir_attr);
175 dprintk("NFS reply lookup: %d\n", status); 178 dprintk("NFS reply lookup: %d\n", status);
176 return status; 179 return status;
177} 180}
178 181
179static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) 182static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
180{ 183{
181 struct nfs_fattr fattr;
182 struct nfs3_accessargs arg = { 184 struct nfs3_accessargs arg = {
183 .fh = NFS_FH(inode), 185 .fh = NFS_FH(inode),
184 }; 186 };
185 struct nfs3_accessres res = { 187 struct nfs3_accessres res;
186 .fattr = &fattr,
187 };
188 struct rpc_message msg = { 188 struct rpc_message msg = {
189 .rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS], 189 .rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS],
190 .rpc_argp = &arg, 190 .rpc_argp = &arg,
@@ -192,7 +192,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
192 .rpc_cred = entry->cred, 192 .rpc_cred = entry->cred,
193 }; 193 };
194 int mode = entry->mask; 194 int mode = entry->mask;
195 int status; 195 int status = -ENOMEM;
196 196
197 dprintk("NFS call access\n"); 197 dprintk("NFS call access\n");
198 198
@@ -209,9 +209,13 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
209 if (mode & MAY_EXEC) 209 if (mode & MAY_EXEC)
210 arg.access |= NFS3_ACCESS_EXECUTE; 210 arg.access |= NFS3_ACCESS_EXECUTE;
211 } 211 }
212 nfs_fattr_init(&fattr); 212
213 res.fattr = nfs_alloc_fattr();
214 if (res.fattr == NULL)
215 goto out;
216
213 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 217 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
214 nfs_refresh_inode(inode, &fattr); 218 nfs_refresh_inode(inode, res.fattr);
215 if (status == 0) { 219 if (status == 0) {
216 entry->mask = 0; 220 entry->mask = 0;
217 if (res.access & NFS3_ACCESS_READ) 221 if (res.access & NFS3_ACCESS_READ)
@@ -221,6 +225,8 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
221 if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE)) 225 if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE))
222 entry->mask |= MAY_EXEC; 226 entry->mask |= MAY_EXEC;
223 } 227 }
228 nfs_free_fattr(res.fattr);
229out:
224 dprintk("NFS reply access: %d\n", status); 230 dprintk("NFS reply access: %d\n", status);
225 return status; 231 return status;
226} 232}
@@ -228,7 +234,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
228static int nfs3_proc_readlink(struct inode *inode, struct page *page, 234static int nfs3_proc_readlink(struct inode *inode, struct page *page,
229 unsigned int pgbase, unsigned int pglen) 235 unsigned int pgbase, unsigned int pglen)
230{ 236{
231 struct nfs_fattr fattr; 237 struct nfs_fattr *fattr;
232 struct nfs3_readlinkargs args = { 238 struct nfs3_readlinkargs args = {
233 .fh = NFS_FH(inode), 239 .fh = NFS_FH(inode),
234 .pgbase = pgbase, 240 .pgbase = pgbase,
@@ -238,14 +244,19 @@ static int nfs3_proc_readlink(struct inode *inode, struct page *page,
238 struct rpc_message msg = { 244 struct rpc_message msg = {
239 .rpc_proc = &nfs3_procedures[NFS3PROC_READLINK], 245 .rpc_proc = &nfs3_procedures[NFS3PROC_READLINK],
240 .rpc_argp = &args, 246 .rpc_argp = &args,
241 .rpc_resp = &fattr,
242 }; 247 };
243 int status; 248 int status = -ENOMEM;
244 249
245 dprintk("NFS call readlink\n"); 250 dprintk("NFS call readlink\n");
246 nfs_fattr_init(&fattr); 251 fattr = nfs_alloc_fattr();
252 if (fattr == NULL)
253 goto out;
254 msg.rpc_resp = fattr;
255
247 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 256 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
248 nfs_refresh_inode(inode, &fattr); 257 nfs_refresh_inode(inode, fattr);
258 nfs_free_fattr(fattr);
259out:
249 dprintk("NFS reply readlink: %d\n", status); 260 dprintk("NFS reply readlink: %d\n", status);
250 return status; 261 return status;
251} 262}
@@ -395,12 +406,17 @@ nfs3_proc_remove(struct inode *dir, struct qstr *name)
395 .rpc_argp = &arg, 406 .rpc_argp = &arg,
396 .rpc_resp = &res, 407 .rpc_resp = &res,
397 }; 408 };
398 int status; 409 int status = -ENOMEM;
399 410
400 dprintk("NFS call remove %s\n", name->name); 411 dprintk("NFS call remove %s\n", name->name);
401 nfs_fattr_init(&res.dir_attr); 412 res.dir_attr = nfs_alloc_fattr();
413 if (res.dir_attr == NULL)
414 goto out;
415
402 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 416 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
403 nfs_post_op_update_inode(dir, &res.dir_attr); 417 nfs_post_op_update_inode(dir, res.dir_attr);
418 nfs_free_fattr(res.dir_attr);
419out:
404 dprintk("NFS reply remove: %d\n", status); 420 dprintk("NFS reply remove: %d\n", status);
405 return status; 421 return status;
406} 422}
@@ -418,7 +434,7 @@ nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir)
418 if (nfs3_async_handle_jukebox(task, dir)) 434 if (nfs3_async_handle_jukebox(task, dir))
419 return 0; 435 return 0;
420 res = task->tk_msg.rpc_resp; 436 res = task->tk_msg.rpc_resp;
421 nfs_post_op_update_inode(dir, &res->dir_attr); 437 nfs_post_op_update_inode(dir, res->dir_attr);
422 return 1; 438 return 1;
423} 439}
424 440
@@ -426,7 +442,6 @@ static int
426nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, 442nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
427 struct inode *new_dir, struct qstr *new_name) 443 struct inode *new_dir, struct qstr *new_name)
428{ 444{
429 struct nfs_fattr old_dir_attr, new_dir_attr;
430 struct nfs3_renameargs arg = { 445 struct nfs3_renameargs arg = {
431 .fromfh = NFS_FH(old_dir), 446 .fromfh = NFS_FH(old_dir),
432 .fromname = old_name->name, 447 .fromname = old_name->name,
@@ -435,23 +450,27 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
435 .toname = new_name->name, 450 .toname = new_name->name,
436 .tolen = new_name->len 451 .tolen = new_name->len
437 }; 452 };
438 struct nfs3_renameres res = { 453 struct nfs3_renameres res;
439 .fromattr = &old_dir_attr,
440 .toattr = &new_dir_attr
441 };
442 struct rpc_message msg = { 454 struct rpc_message msg = {
443 .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME], 455 .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME],
444 .rpc_argp = &arg, 456 .rpc_argp = &arg,
445 .rpc_resp = &res, 457 .rpc_resp = &res,
446 }; 458 };
447 int status; 459 int status = -ENOMEM;
448 460
449 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); 461 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name);
450 nfs_fattr_init(&old_dir_attr); 462
451 nfs_fattr_init(&new_dir_attr); 463 res.fromattr = nfs_alloc_fattr();
464 res.toattr = nfs_alloc_fattr();
465 if (res.fromattr == NULL || res.toattr == NULL)
466 goto out;
467
452 status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); 468 status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
453 nfs_post_op_update_inode(old_dir, &old_dir_attr); 469 nfs_post_op_update_inode(old_dir, res.fromattr);
454 nfs_post_op_update_inode(new_dir, &new_dir_attr); 470 nfs_post_op_update_inode(new_dir, res.toattr);
471out:
472 nfs_free_fattr(res.toattr);
473 nfs_free_fattr(res.fromattr);
455 dprintk("NFS reply rename: %d\n", status); 474 dprintk("NFS reply rename: %d\n", status);
456 return status; 475 return status;
457} 476}
@@ -459,30 +478,32 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
459static int 478static int
460nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) 479nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
461{ 480{
462 struct nfs_fattr dir_attr, fattr;
463 struct nfs3_linkargs arg = { 481 struct nfs3_linkargs arg = {
464 .fromfh = NFS_FH(inode), 482 .fromfh = NFS_FH(inode),
465 .tofh = NFS_FH(dir), 483 .tofh = NFS_FH(dir),
466 .toname = name->name, 484 .toname = name->name,
467 .tolen = name->len 485 .tolen = name->len
468 }; 486 };
469 struct nfs3_linkres res = { 487 struct nfs3_linkres res;
470 .dir_attr = &dir_attr,
471 .fattr = &fattr
472 };
473 struct rpc_message msg = { 488 struct rpc_message msg = {
474 .rpc_proc = &nfs3_procedures[NFS3PROC_LINK], 489 .rpc_proc = &nfs3_procedures[NFS3PROC_LINK],
475 .rpc_argp = &arg, 490 .rpc_argp = &arg,
476 .rpc_resp = &res, 491 .rpc_resp = &res,
477 }; 492 };
478 int status; 493 int status = -ENOMEM;
479 494
480 dprintk("NFS call link %s\n", name->name); 495 dprintk("NFS call link %s\n", name->name);
481 nfs_fattr_init(&dir_attr); 496 res.fattr = nfs_alloc_fattr();
482 nfs_fattr_init(&fattr); 497 res.dir_attr = nfs_alloc_fattr();
498 if (res.fattr == NULL || res.dir_attr == NULL)
499 goto out;
500
483 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 501 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
484 nfs_post_op_update_inode(dir, &dir_attr); 502 nfs_post_op_update_inode(dir, res.dir_attr);
485 nfs_post_op_update_inode(inode, &fattr); 503 nfs_post_op_update_inode(inode, res.fattr);
504out:
505 nfs_free_fattr(res.dir_attr);
506 nfs_free_fattr(res.fattr);
486 dprintk("NFS reply link: %d\n", status); 507 dprintk("NFS reply link: %d\n", status);
487 return status; 508 return status;
488} 509}
@@ -553,7 +574,7 @@ out:
553static int 574static int
554nfs3_proc_rmdir(struct inode *dir, struct qstr *name) 575nfs3_proc_rmdir(struct inode *dir, struct qstr *name)
555{ 576{
556 struct nfs_fattr dir_attr; 577 struct nfs_fattr *dir_attr;
557 struct nfs3_diropargs arg = { 578 struct nfs3_diropargs arg = {
558 .fh = NFS_FH(dir), 579 .fh = NFS_FH(dir),
559 .name = name->name, 580 .name = name->name,
@@ -562,14 +583,19 @@ nfs3_proc_rmdir(struct inode *dir, struct qstr *name)
562 struct rpc_message msg = { 583 struct rpc_message msg = {
563 .rpc_proc = &nfs3_procedures[NFS3PROC_RMDIR], 584 .rpc_proc = &nfs3_procedures[NFS3PROC_RMDIR],
564 .rpc_argp = &arg, 585 .rpc_argp = &arg,
565 .rpc_resp = &dir_attr,
566 }; 586 };
567 int status; 587 int status = -ENOMEM;
568 588
569 dprintk("NFS call rmdir %s\n", name->name); 589 dprintk("NFS call rmdir %s\n", name->name);
570 nfs_fattr_init(&dir_attr); 590 dir_attr = nfs_alloc_fattr();
591 if (dir_attr == NULL)
592 goto out;
593
594 msg.rpc_resp = dir_attr;
571 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 595 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
572 nfs_post_op_update_inode(dir, &dir_attr); 596 nfs_post_op_update_inode(dir, dir_attr);
597 nfs_free_fattr(dir_attr);
598out:
573 dprintk("NFS reply rmdir: %d\n", status); 599 dprintk("NFS reply rmdir: %d\n", status);
574 return status; 600 return status;
575} 601}
@@ -588,7 +614,6 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
588 u64 cookie, struct page *page, unsigned int count, int plus) 614 u64 cookie, struct page *page, unsigned int count, int plus)
589{ 615{
590 struct inode *dir = dentry->d_inode; 616 struct inode *dir = dentry->d_inode;
591 struct nfs_fattr dir_attr;
592 __be32 *verf = NFS_COOKIEVERF(dir); 617 __be32 *verf = NFS_COOKIEVERF(dir);
593 struct nfs3_readdirargs arg = { 618 struct nfs3_readdirargs arg = {
594 .fh = NFS_FH(dir), 619 .fh = NFS_FH(dir),
@@ -599,7 +624,6 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
599 .pages = &page 624 .pages = &page
600 }; 625 };
601 struct nfs3_readdirres res = { 626 struct nfs3_readdirres res = {
602 .dir_attr = &dir_attr,
603 .verf = verf, 627 .verf = verf,
604 .plus = plus 628 .plus = plus
605 }; 629 };
@@ -609,7 +633,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
609 .rpc_resp = &res, 633 .rpc_resp = &res,
610 .rpc_cred = cred 634 .rpc_cred = cred
611 }; 635 };
612 int status; 636 int status = -ENOMEM;
613 637
614 if (plus) 638 if (plus)
615 msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS]; 639 msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS];
@@ -617,12 +641,17 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
617 dprintk("NFS call readdir%s %d\n", 641 dprintk("NFS call readdir%s %d\n",
618 plus? "plus" : "", (unsigned int) cookie); 642 plus? "plus" : "", (unsigned int) cookie);
619 643
620 nfs_fattr_init(&dir_attr); 644 res.dir_attr = nfs_alloc_fattr();
645 if (res.dir_attr == NULL)
646 goto out;
647
621 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 648 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
622 649
623 nfs_invalidate_atime(dir); 650 nfs_invalidate_atime(dir);
651 nfs_refresh_inode(dir, res.dir_attr);
624 652
625 nfs_refresh_inode(dir, &dir_attr); 653 nfs_free_fattr(res.dir_attr);
654out:
626 dprintk("NFS reply readdir: %d\n", status); 655 dprintk("NFS reply readdir: %d\n", status);
627 return status; 656 return status;
628} 657}
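[Note] Most of the nfs3proc.c conversions above share one more detail: status is initialized to -ENOMEM before the allocation, so every early goto out after a failed nfs_alloc_fattr() returns the right error with no assignment at each failure site. A compilable model of that idiom, with illustrative names rather than the kernel API:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct fattr { long valid; };

static int proc_rmdir(const char *name)
{
	struct fattr *dir_attr;
	int status = -ENOMEM;      /* covers every goto before the call */

	dir_attr = calloc(1, sizeof(*dir_attr));
	if (dir_attr == NULL)
		goto out;          /* returns -ENOMEM without an extra assignment */

	status = 0;                /* stands in for rpc_call_sync() succeeding */
	free(dir_attr);
out:
	printf("rmdir %s: %d\n", name, status);
	return status;
}

int main(void)
{
	return proc_rmdir("testdir") ? 1 : 0;
}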
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 5fe5492fbd29..75dcfc7da365 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -9,7 +9,6 @@
9#include <linux/param.h> 9#include <linux/param.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/slab.h>
13#include <linux/errno.h> 12#include <linux/errno.h>
14#include <linux/string.h> 13#include <linux/string.h>
15#include <linux/in.h> 14#include <linux/in.h>
@@ -763,7 +762,7 @@ nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
763static int 762static int
764nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res) 763nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res)
765{ 764{
766 return nfs3_xdr_wccstat(req, p, &res->dir_attr); 765 return nfs3_xdr_wccstat(req, p, res->dir_attr);
767} 766}
768 767
769/* 768/*
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a187200a7aac..c538c6106e16 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -206,14 +206,14 @@ extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t);
206 206
207 207
208/* nfs4proc.c */ 208/* nfs4proc.c */
209extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *); 209extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
210extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *); 210extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
211extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred); 211extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred);
212extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *); 212extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
213extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); 213extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
214extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); 214extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
215extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); 215extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
216extern int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait); 216extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait);
217extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); 217extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
218extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); 218extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
219extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); 219extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
@@ -286,7 +286,7 @@ extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
286extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); 286extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
287extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); 287extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t);
288 288
289extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter); 289extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
290extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); 290extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
291extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid); 291extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid);
292extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid); 292extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid);
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index fa3408f20112..3c2a1724fbd2 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -11,6 +11,7 @@
11#include <linux/mount.h> 11#include <linux/mount.h>
12#include <linux/namei.h> 12#include <linux/namei.h>
13#include <linux/nfs_fs.h> 13#include <linux/nfs_fs.h>
14#include <linux/slab.h>
14#include <linux/string.h> 15#include <linux/string.h>
15#include <linux/sunrpc/clnt.h> 16#include <linux/sunrpc/clnt.h>
16#include <linux/vfs.h> 17#include <linux/vfs.h>
@@ -114,6 +115,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
114 char *page, char *page2, 115 char *page, char *page2,
115 const struct nfs4_fs_location *location) 116 const struct nfs4_fs_location *location)
116{ 117{
118 const size_t addr_bufsize = sizeof(struct sockaddr_storage);
117 struct vfsmount *mnt = ERR_PTR(-ENOENT); 119 struct vfsmount *mnt = ERR_PTR(-ENOENT);
118 char *mnt_path; 120 char *mnt_path;
119 unsigned int maxbuflen; 121 unsigned int maxbuflen;
@@ -125,9 +127,12 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
125 mountdata->mnt_path = mnt_path; 127 mountdata->mnt_path = mnt_path;
126 maxbuflen = mnt_path - 1 - page2; 128 maxbuflen = mnt_path - 1 - page2;
127 129
130 mountdata->addr = kmalloc(addr_bufsize, GFP_KERNEL);
131 if (mountdata->addr == NULL)
132 return ERR_PTR(-ENOMEM);
133
128 for (s = 0; s < location->nservers; s++) { 134 for (s = 0; s < location->nservers; s++) {
129 const struct nfs4_string *buf = &location->servers[s]; 135 const struct nfs4_string *buf = &location->servers[s];
130 struct sockaddr_storage addr;
131 136
132 if (buf->len <= 0 || buf->len >= maxbuflen) 137 if (buf->len <= 0 || buf->len >= maxbuflen)
133 continue; 138 continue;
@@ -136,11 +141,10 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
136 continue; 141 continue;
137 142
138 mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len, 143 mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len,
139 (struct sockaddr *)&addr, sizeof(addr)); 144 mountdata->addr, addr_bufsize);
140 if (mountdata->addrlen == 0) 145 if (mountdata->addrlen == 0)
141 continue; 146 continue;
142 147
143 mountdata->addr = (struct sockaddr *)&addr;
144 rpc_set_port(mountdata->addr, NFS_PORT); 148 rpc_set_port(mountdata->addr, NFS_PORT);
145 149
146 memcpy(page2, buf->data, buf->len); 150 memcpy(page2, buf->data, buf->len);
@@ -155,6 +159,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
155 if (!IS_ERR(mnt)) 159 if (!IS_ERR(mnt))
156 break; 160 break;
157 } 161 }
162 kfree(mountdata->addr);
158 return mnt; 163 return mnt;
159} 164}
160 165
@@ -220,8 +225,8 @@ out:
220 225
221/* 226/*
222 * nfs_do_refmount - handle crossing a referral on server 227 * nfs_do_refmount - handle crossing a referral on server
228 * @mnt_parent - mountpoint of referral
223 * @dentry - dentry of referral 229 * @dentry - dentry of referral
224 * @nd - nameidata info
225 * 230 *
226 */ 231 */
227struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry) 232struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index eda74c42d552..70015dd60a98 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -39,6 +39,7 @@
39#include <linux/delay.h> 39#include <linux/delay.h>
40#include <linux/errno.h> 40#include <linux/errno.h>
41#include <linux/string.h> 41#include <linux/string.h>
42#include <linux/slab.h>
42#include <linux/sunrpc/clnt.h> 43#include <linux/sunrpc/clnt.h>
43#include <linux/nfs.h> 44#include <linux/nfs.h>
44#include <linux/nfs4.h> 45#include <linux/nfs4.h>
@@ -69,6 +70,9 @@ static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinf
69static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); 70static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
70static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 71static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
71static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 72static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
73static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
74 struct nfs_fattr *fattr, struct iattr *sattr,
75 struct nfs4_state *state);
72 76
73/* Prevent leaks of NFSv4 errors into userland */ 77/* Prevent leaks of NFSv4 errors into userland */
74static int nfs4_map_errors(int err) 78static int nfs4_map_errors(int err)
@@ -713,17 +717,18 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
713 717
714static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, 718static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
715 struct nfs4_state_owner *sp, fmode_t fmode, int flags, 719 struct nfs4_state_owner *sp, fmode_t fmode, int flags,
716 const struct iattr *attrs) 720 const struct iattr *attrs,
721 gfp_t gfp_mask)
717{ 722{
718 struct dentry *parent = dget_parent(path->dentry); 723 struct dentry *parent = dget_parent(path->dentry);
719 struct inode *dir = parent->d_inode; 724 struct inode *dir = parent->d_inode;
720 struct nfs_server *server = NFS_SERVER(dir); 725 struct nfs_server *server = NFS_SERVER(dir);
721 struct nfs4_opendata *p; 726 struct nfs4_opendata *p;
722 727
723 p = kzalloc(sizeof(*p), GFP_KERNEL); 728 p = kzalloc(sizeof(*p), gfp_mask);
724 if (p == NULL) 729 if (p == NULL)
725 goto err; 730 goto err;
726 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); 731 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask);
727 if (p->o_arg.seqid == NULL) 732 if (p->o_arg.seqid == NULL)
728 goto err_free; 733 goto err_free;
729 path_get(path); 734 path_get(path);
@@ -1059,7 +1064,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
1059{ 1064{
1060 struct nfs4_opendata *opendata; 1065 struct nfs4_opendata *opendata;
1061 1066
1062 opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL); 1067 opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL, GFP_NOFS);
1063 if (opendata == NULL) 1068 if (opendata == NULL)
1064 return ERR_PTR(-ENOMEM); 1069 return ERR_PTR(-ENOMEM);
1065 opendata->state = state; 1070 opendata->state = state;
@@ -1522,6 +1527,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
1522 nfs_post_op_update_inode(dir, o_res->dir_attr); 1527 nfs_post_op_update_inode(dir, o_res->dir_attr);
1523 } else 1528 } else
1524 nfs_refresh_inode(dir, o_res->dir_attr); 1529 nfs_refresh_inode(dir, o_res->dir_attr);
1530 if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0)
1531 server->caps &= ~NFS_CAP_POSIX_LOCK;
1525 if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { 1532 if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
1526 status = _nfs4_proc_open_confirm(data); 1533 status = _nfs4_proc_open_confirm(data);
1527 if (status != 0) 1534 if (status != 0)
@@ -1645,7 +1652,7 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in
1645 if (path->dentry->d_inode != NULL) 1652 if (path->dentry->d_inode != NULL)
1646 nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode); 1653 nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode);
1647 status = -ENOMEM; 1654 status = -ENOMEM;
1648 opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr); 1655 opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr, GFP_KERNEL);
1649 if (opendata == NULL) 1656 if (opendata == NULL)
1650 goto err_put_state_owner; 1657 goto err_put_state_owner;
1651 1658
@@ -1656,15 +1663,24 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in
1656 if (status != 0) 1663 if (status != 0)
1657 goto err_opendata_put; 1664 goto err_opendata_put;
1658 1665
1659 if (opendata->o_arg.open_flags & O_EXCL)
1660 nfs4_exclusive_attrset(opendata, sattr);
1661
1662 state = nfs4_opendata_to_nfs4_state(opendata); 1666 state = nfs4_opendata_to_nfs4_state(opendata);
1663 status = PTR_ERR(state); 1667 status = PTR_ERR(state);
1664 if (IS_ERR(state)) 1668 if (IS_ERR(state))
1665 goto err_opendata_put; 1669 goto err_opendata_put;
1666 if ((opendata->o_res.rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) != 0) 1670 if (server->caps & NFS_CAP_POSIX_LOCK)
1667 set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); 1671 set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
1672
1673 if (opendata->o_arg.open_flags & O_EXCL) {
1674 nfs4_exclusive_attrset(opendata, sattr);
1675
1676 nfs_fattr_init(opendata->o_res.f_attr);
1677 status = nfs4_do_setattr(state->inode, cred,
1678 opendata->o_res.f_attr, sattr,
1679 state);
1680 if (status == 0)
1681 nfs_setattr_update_inode(state->inode, sattr);
1682 nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr);
1683 }
1668 nfs4_opendata_put(opendata); 1684 nfs4_opendata_put(opendata);
1669 nfs4_put_state_owner(sp); 1685 nfs4_put_state_owner(sp);
1670 *res = state; 1686 *res = state;
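[Note] Two related changes land in the hunks above: the LOCKTYPE_POSIX flag from the OPEN reply is now latched into server->caps once (in _nfs4_proc_open) and consulted per-state afterwards, and the O_EXCL attribute fixup moves from nfs4_proc_create() into _nfs4_do_open() so it runs against the freshly obtained open state. A toy model of the capability latching, using made-up flag values rather than the kernel constants:

#include <stdio.h>

#define OPEN_RESULT_LOCKTYPE_POSIX 0x1   /* illustrative values only */
#define CAP_POSIX_LOCK             0x1
#define STATE_POSIX_LOCKS          0x1

struct server { unsigned caps; };
struct state  { unsigned flags; };

/* Clear the capability the first time any OPEN reply omits the flag;
 * later opens consult server->caps instead of their own reply. */
static void record_open_reply(struct server *srv, unsigned rflags)
{
	if ((rflags & OPEN_RESULT_LOCKTYPE_POSIX) == 0)
		srv->caps &= ~CAP_POSIX_LOCK;
}

static void setup_state(const struct server *srv, struct state *st)
{
	if (srv->caps & CAP_POSIX_LOCK)
		st->flags |= STATE_POSIX_LOCKS;
}

int main(void)
{
	struct server srv = { .caps = CAP_POSIX_LOCK };
	struct state st = { 0 };

	record_open_reply(&srv, 0);   /* server did not offer POSIX locking */
	setup_state(&srv, &st);
	printf("posix locks: %s\n", (st.flags & STATE_POSIX_LOCKS) ? "yes" : "no");
	return 0;
}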
@@ -1911,7 +1927,7 @@ static const struct rpc_call_ops nfs4_close_ops = {
1911 * 1927 *
1912 * NOTE: Caller must be holding the sp->so_owner semaphore! 1928 * NOTE: Caller must be holding the sp->so_owner semaphore!
1913 */ 1929 */
1914int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait) 1930int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait)
1915{ 1931{
1916 struct nfs_server *server = NFS_SERVER(state->inode); 1932 struct nfs_server *server = NFS_SERVER(state->inode);
1917 struct nfs4_closedata *calldata; 1933 struct nfs4_closedata *calldata;
@@ -1930,7 +1946,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
1930 }; 1946 };
1931 int status = -ENOMEM; 1947 int status = -ENOMEM;
1932 1948
1933 calldata = kzalloc(sizeof(*calldata), GFP_KERNEL); 1949 calldata = kzalloc(sizeof(*calldata), gfp_mask);
1934 if (calldata == NULL) 1950 if (calldata == NULL)
1935 goto out; 1951 goto out;
1936 calldata->inode = state->inode; 1952 calldata->inode = state->inode;
@@ -1938,7 +1954,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
1938 calldata->arg.fh = NFS_FH(state->inode); 1954 calldata->arg.fh = NFS_FH(state->inode);
1939 calldata->arg.stateid = &state->open_stateid; 1955 calldata->arg.stateid = &state->open_stateid;
1940 /* Serialization for the sequence id */ 1956 /* Serialization for the sequence id */
1941 calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid); 1957 calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid, gfp_mask);
1942 if (calldata->arg.seqid == NULL) 1958 if (calldata->arg.seqid == NULL)
1943 goto out_free_calldata; 1959 goto out_free_calldata;
1944 calldata->arg.fmode = 0; 1960 calldata->arg.fmode = 0;
@@ -2067,8 +2083,7 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
2067 case -EDQUOT: 2083 case -EDQUOT:
2068 case -ENOSPC: 2084 case -ENOSPC:
2069 case -EROFS: 2085 case -EROFS:
2070 lookup_instantiate_filp(nd, (struct dentry *)state, NULL); 2086 return PTR_ERR(state);
2071 return 1;
2072 default: 2087 default:
2073 goto out_drop; 2088 goto out_drop;
2074 } 2089 }
@@ -2402,14 +2417,12 @@ static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh
2402static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) 2417static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)
2403{ 2418{
2404 struct nfs_server *server = NFS_SERVER(inode); 2419 struct nfs_server *server = NFS_SERVER(inode);
2405 struct nfs_fattr fattr;
2406 struct nfs4_accessargs args = { 2420 struct nfs4_accessargs args = {
2407 .fh = NFS_FH(inode), 2421 .fh = NFS_FH(inode),
2408 .bitmask = server->attr_bitmask, 2422 .bitmask = server->attr_bitmask,
2409 }; 2423 };
2410 struct nfs4_accessres res = { 2424 struct nfs4_accessres res = {
2411 .server = server, 2425 .server = server,
2412 .fattr = &fattr,
2413 }; 2426 };
2414 struct rpc_message msg = { 2427 struct rpc_message msg = {
2415 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS], 2428 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS],
@@ -2436,7 +2449,11 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
2436 if (mode & MAY_EXEC) 2449 if (mode & MAY_EXEC)
2437 args.access |= NFS4_ACCESS_EXECUTE; 2450 args.access |= NFS4_ACCESS_EXECUTE;
2438 } 2451 }
2439 nfs_fattr_init(&fattr); 2452
2453 res.fattr = nfs_alloc_fattr();
2454 if (res.fattr == NULL)
2455 return -ENOMEM;
2456
2440 status = nfs4_call_sync(server, &msg, &args, &res, 0); 2457 status = nfs4_call_sync(server, &msg, &args, &res, 0);
2441 if (!status) { 2458 if (!status) {
2442 entry->mask = 0; 2459 entry->mask = 0;
@@ -2446,8 +2463,9 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
2446 entry->mask |= MAY_WRITE; 2463 entry->mask |= MAY_WRITE;
2447 if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE)) 2464 if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE))
2448 entry->mask |= MAY_EXEC; 2465 entry->mask |= MAY_EXEC;
2449 nfs_refresh_inode(inode, &fattr); 2466 nfs_refresh_inode(inode, res.fattr);
2450 } 2467 }
2468 nfs_free_fattr(res.fattr);
2451 return status; 2469 return status;
2452} 2470}
2453 2471
@@ -2560,13 +2578,6 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
2560 } 2578 }
2561 d_add(dentry, igrab(state->inode)); 2579 d_add(dentry, igrab(state->inode));
2562 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 2580 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
2563 if (flags & O_EXCL) {
2564 struct nfs_fattr fattr;
2565 status = nfs4_do_setattr(state->inode, cred, &fattr, sattr, state);
2566 if (status == 0)
2567 nfs_setattr_update_inode(state->inode, sattr);
2568 nfs_post_op_update_inode(state->inode, &fattr);
2569 }
2570 if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0) 2581 if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0)
2571 status = nfs4_intent_set_file(nd, &path, state, fmode); 2582 status = nfs4_intent_set_file(nd, &path, state, fmode);
2572 else 2583 else
@@ -2594,14 +2605,19 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
2594 .rpc_argp = &args, 2605 .rpc_argp = &args,
2595 .rpc_resp = &res, 2606 .rpc_resp = &res,
2596 }; 2607 };
2597 int status; 2608 int status = -ENOMEM;
2609
2610 res.dir_attr = nfs_alloc_fattr();
2611 if (res.dir_attr == NULL)
2612 goto out;
2598 2613
2599 nfs_fattr_init(&res.dir_attr);
2600 status = nfs4_call_sync(server, &msg, &args, &res, 1); 2614 status = nfs4_call_sync(server, &msg, &args, &res, 1);
2601 if (status == 0) { 2615 if (status == 0) {
2602 update_changeattr(dir, &res.cinfo); 2616 update_changeattr(dir, &res.cinfo);
2603 nfs_post_op_update_inode(dir, &res.dir_attr); 2617 nfs_post_op_update_inode(dir, res.dir_attr);
2604 } 2618 }
2619 nfs_free_fattr(res.dir_attr);
2620out:
2605 return status; 2621 return status;
2606} 2622}
2607 2623
@@ -2636,7 +2652,7 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
2636 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) 2652 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
2637 return 0; 2653 return 0;
2638 update_changeattr(dir, &res->cinfo); 2654 update_changeattr(dir, &res->cinfo);
2639 nfs_post_op_update_inode(dir, &res->dir_attr); 2655 nfs_post_op_update_inode(dir, res->dir_attr);
2640 return 1; 2656 return 1;
2641} 2657}
2642 2658
@@ -2651,29 +2667,31 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
2651 .new_name = new_name, 2667 .new_name = new_name,
2652 .bitmask = server->attr_bitmask, 2668 .bitmask = server->attr_bitmask,
2653 }; 2669 };
2654 struct nfs_fattr old_fattr, new_fattr;
2655 struct nfs4_rename_res res = { 2670 struct nfs4_rename_res res = {
2656 .server = server, 2671 .server = server,
2657 .old_fattr = &old_fattr,
2658 .new_fattr = &new_fattr,
2659 }; 2672 };
2660 struct rpc_message msg = { 2673 struct rpc_message msg = {
2661 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME], 2674 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME],
2662 .rpc_argp = &arg, 2675 .rpc_argp = &arg,
2663 .rpc_resp = &res, 2676 .rpc_resp = &res,
2664 }; 2677 };
2665 int status; 2678 int status = -ENOMEM;
2666 2679
2667 nfs_fattr_init(res.old_fattr); 2680 res.old_fattr = nfs_alloc_fattr();
2668 nfs_fattr_init(res.new_fattr); 2681 res.new_fattr = nfs_alloc_fattr();
2669 status = nfs4_call_sync(server, &msg, &arg, &res, 1); 2682 if (res.old_fattr == NULL || res.new_fattr == NULL)
2683 goto out;
2670 2684
2685 status = nfs4_call_sync(server, &msg, &arg, &res, 1);
2671 if (!status) { 2686 if (!status) {
2672 update_changeattr(old_dir, &res.old_cinfo); 2687 update_changeattr(old_dir, &res.old_cinfo);
2673 nfs_post_op_update_inode(old_dir, res.old_fattr); 2688 nfs_post_op_update_inode(old_dir, res.old_fattr);
2674 update_changeattr(new_dir, &res.new_cinfo); 2689 update_changeattr(new_dir, &res.new_cinfo);
2675 nfs_post_op_update_inode(new_dir, res.new_fattr); 2690 nfs_post_op_update_inode(new_dir, res.new_fattr);
2676 } 2691 }
2692out:
2693 nfs_free_fattr(res.new_fattr);
2694 nfs_free_fattr(res.old_fattr);
2677 return status; 2695 return status;
2678} 2696}
2679 2697
@@ -2700,28 +2718,30 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
2700 .name = name, 2718 .name = name,
2701 .bitmask = server->attr_bitmask, 2719 .bitmask = server->attr_bitmask,
2702 }; 2720 };
2703 struct nfs_fattr fattr, dir_attr;
2704 struct nfs4_link_res res = { 2721 struct nfs4_link_res res = {
2705 .server = server, 2722 .server = server,
2706 .fattr = &fattr,
2707 .dir_attr = &dir_attr,
2708 }; 2723 };
2709 struct rpc_message msg = { 2724 struct rpc_message msg = {
2710 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK], 2725 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK],
2711 .rpc_argp = &arg, 2726 .rpc_argp = &arg,
2712 .rpc_resp = &res, 2727 .rpc_resp = &res,
2713 }; 2728 };
2714 int status; 2729 int status = -ENOMEM;
2730
2731 res.fattr = nfs_alloc_fattr();
2732 res.dir_attr = nfs_alloc_fattr();
2733 if (res.fattr == NULL || res.dir_attr == NULL)
2734 goto out;
2715 2735
2716 nfs_fattr_init(res.fattr);
2717 nfs_fattr_init(res.dir_attr);
2718 status = nfs4_call_sync(server, &msg, &arg, &res, 1); 2736 status = nfs4_call_sync(server, &msg, &arg, &res, 1);
2719 if (!status) { 2737 if (!status) {
2720 update_changeattr(dir, &res.cinfo); 2738 update_changeattr(dir, &res.cinfo);
2721 nfs_post_op_update_inode(dir, res.dir_attr); 2739 nfs_post_op_update_inode(dir, res.dir_attr);
2722 nfs_post_op_update_inode(inode, res.fattr); 2740 nfs_post_op_update_inode(inode, res.fattr);
2723 } 2741 }
2724 2742out:
2743 nfs_free_fattr(res.dir_attr);
2744 nfs_free_fattr(res.fattr);
2725 return status; 2745 return status;
2726} 2746}
2727 2747
@@ -3144,23 +3164,31 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
3144 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; 3164 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
3145} 3165}
3146 3166
3167struct nfs4_renewdata {
3168 struct nfs_client *client;
3169 unsigned long timestamp;
3170};
3171
3147/* 3172/*
3148 * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special 3173 * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special
3149 * standalone procedure for queueing an asynchronous RENEW. 3174 * standalone procedure for queueing an asynchronous RENEW.
3150 */ 3175 */
3151static void nfs4_renew_release(void *data) 3176static void nfs4_renew_release(void *calldata)
3152{ 3177{
3153 struct nfs_client *clp = data; 3178 struct nfs4_renewdata *data = calldata;
3179 struct nfs_client *clp = data->client;
3154 3180
3155 if (atomic_read(&clp->cl_count) > 1) 3181 if (atomic_read(&clp->cl_count) > 1)
3156 nfs4_schedule_state_renewal(clp); 3182 nfs4_schedule_state_renewal(clp);
3157 nfs_put_client(clp); 3183 nfs_put_client(clp);
3184 kfree(data);
3158} 3185}
3159 3186
3160static void nfs4_renew_done(struct rpc_task *task, void *data) 3187static void nfs4_renew_done(struct rpc_task *task, void *calldata)
3161{ 3188{
3162 struct nfs_client *clp = data; 3189 struct nfs4_renewdata *data = calldata;
3163 unsigned long timestamp = task->tk_start; 3190 struct nfs_client *clp = data->client;
3191 unsigned long timestamp = data->timestamp;
3164 3192
3165 if (task->tk_status < 0) { 3193 if (task->tk_status < 0) {
3166 /* Unless we're shutting down, schedule state recovery! */ 3194 /* Unless we're shutting down, schedule state recovery! */
@@ -3186,11 +3214,17 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
3186 .rpc_argp = clp, 3214 .rpc_argp = clp,
3187 .rpc_cred = cred, 3215 .rpc_cred = cred,
3188 }; 3216 };
3217 struct nfs4_renewdata *data;
3189 3218
3190 if (!atomic_inc_not_zero(&clp->cl_count)) 3219 if (!atomic_inc_not_zero(&clp->cl_count))
3191 return -EIO; 3220 return -EIO;
3221 data = kmalloc(sizeof(*data), GFP_KERNEL);
3222 if (data == NULL)
3223 return -ENOMEM;
3224 data->client = clp;
3225 data->timestamp = jiffies;
3192 return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, 3226 return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT,
3193 &nfs4_renew_ops, clp); 3227 &nfs4_renew_ops, data);
3194} 3228}
3195 3229
3196int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) 3230int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
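[Note] The RENEW rework above replaces the bare nfs_client pointer handed to rpc_call_async() with a small heap struct, so the call can also carry its own submit timestamp (the done callback no longer reads task->tk_start), and the struct is freed in the release callback once the RPC finishes. A self-contained model of that hand-off, with a plain function pointer standing in for the RPC machinery:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

struct renewdata {
	int client_id;             /* stands in for struct nfs_client * */
	long timestamp;            /* recorded at submit time */
};

static void renew_done(void *calldata)
{
	struct renewdata *data = calldata;
	printf("renew for client %d submitted at %ld\n",
	       data->client_id, data->timestamp);
}

static void renew_release(void *calldata)
{
	free(calldata);            /* the release callback owns the data */
}

/* Stand-in for rpc_call_async(): runs the callbacks immediately. */
static int call_async(void *calldata,
		      void (*done)(void *), void (*release)(void *))
{
	done(calldata);
	release(calldata);
	return 0;
}

int main(void)
{
	struct renewdata *data = malloc(sizeof(*data));
	if (data == NULL)
		return 1;
	data->client_id = 42;
	data->timestamp = (long)time(NULL);
	return call_async(data, renew_done, renew_release);
}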
@@ -3492,7 +3526,9 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3492 return _nfs4_async_handle_error(task, server, server->nfs_client, state); 3526 return _nfs4_async_handle_error(task, server, server->nfs_client, state);
3493} 3527}
3494 3528
3495int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred) 3529int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
3530 unsigned short port, struct rpc_cred *cred,
3531 struct nfs4_setclientid_res *res)
3496{ 3532{
3497 nfs4_verifier sc_verifier; 3533 nfs4_verifier sc_verifier;
3498 struct nfs4_setclientid setclientid = { 3534 struct nfs4_setclientid setclientid = {
@@ -3502,7 +3538,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po
3502 struct rpc_message msg = { 3538 struct rpc_message msg = {
3503 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], 3539 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
3504 .rpc_argp = &setclientid, 3540 .rpc_argp = &setclientid,
3505 .rpc_resp = clp, 3541 .rpc_resp = res,
3506 .rpc_cred = cred, 3542 .rpc_cred = cred,
3507 }; 3543 };
3508 __be32 *p; 3544 __be32 *p;
@@ -3545,12 +3581,14 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po
3545 return status; 3581 return status;
3546} 3582}
3547 3583
3548static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred) 3584static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp,
3585 struct nfs4_setclientid_res *arg,
3586 struct rpc_cred *cred)
3549{ 3587{
3550 struct nfs_fsinfo fsinfo; 3588 struct nfs_fsinfo fsinfo;
3551 struct rpc_message msg = { 3589 struct rpc_message msg = {
3552 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM], 3590 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM],
3553 .rpc_argp = clp, 3591 .rpc_argp = arg,
3554 .rpc_resp = &fsinfo, 3592 .rpc_resp = &fsinfo,
3555 .rpc_cred = cred, 3593 .rpc_cred = cred,
3556 }; 3594 };
@@ -3568,12 +3606,14 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cre
3568 return status; 3606 return status;
3569} 3607}
3570 3608
3571int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred) 3609int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
3610 struct nfs4_setclientid_res *arg,
3611 struct rpc_cred *cred)
3572{ 3612{
3573 long timeout = 0; 3613 long timeout = 0;
3574 int err; 3614 int err;
3575 do { 3615 do {
3576 err = _nfs4_proc_setclientid_confirm(clp, cred); 3616 err = _nfs4_proc_setclientid_confirm(clp, arg, cred);
3577 switch (err) { 3617 switch (err) {
3578 case 0: 3618 case 0:
3579 return err; 3619 return err;
@@ -3665,7 +3705,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
3665 }; 3705 };
3666 int status = 0; 3706 int status = 0;
3667 3707
3668 data = kzalloc(sizeof(*data), GFP_KERNEL); 3708 data = kzalloc(sizeof(*data), GFP_NOFS);
3669 if (data == NULL) 3709 if (data == NULL)
3670 return -ENOMEM; 3710 return -ENOMEM;
3671 data->args.fhandle = &data->fh; 3711 data->args.fhandle = &data->fh;
@@ -3821,7 +3861,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
3821 struct nfs4_unlockdata *p; 3861 struct nfs4_unlockdata *p;
3822 struct inode *inode = lsp->ls_state->inode; 3862 struct inode *inode = lsp->ls_state->inode;
3823 3863
3824 p = kzalloc(sizeof(*p), GFP_KERNEL); 3864 p = kzalloc(sizeof(*p), GFP_NOFS);
3825 if (p == NULL) 3865 if (p == NULL)
3826 return NULL; 3866 return NULL;
3827 p->arg.fh = NFS_FH(inode); 3867 p->arg.fh = NFS_FH(inode);
@@ -3959,7 +3999,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
3959 if (test_bit(NFS_DELEGATED_STATE, &state->flags)) 3999 if (test_bit(NFS_DELEGATED_STATE, &state->flags))
3960 goto out; 4000 goto out;
3961 lsp = request->fl_u.nfs4_fl.owner; 4001 lsp = request->fl_u.nfs4_fl.owner;
3962 seqid = nfs_alloc_seqid(&lsp->ls_seqid); 4002 seqid = nfs_alloc_seqid(&lsp->ls_seqid, GFP_KERNEL);
3963 status = -ENOMEM; 4003 status = -ENOMEM;
3964 if (seqid == NULL) 4004 if (seqid == NULL)
3965 goto out; 4005 goto out;
@@ -3987,22 +4027,23 @@ struct nfs4_lockdata {
3987}; 4027};
3988 4028
3989static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, 4029static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
3990 struct nfs_open_context *ctx, struct nfs4_lock_state *lsp) 4030 struct nfs_open_context *ctx, struct nfs4_lock_state *lsp,
4031 gfp_t gfp_mask)
3991{ 4032{
3992 struct nfs4_lockdata *p; 4033 struct nfs4_lockdata *p;
3993 struct inode *inode = lsp->ls_state->inode; 4034 struct inode *inode = lsp->ls_state->inode;
3994 struct nfs_server *server = NFS_SERVER(inode); 4035 struct nfs_server *server = NFS_SERVER(inode);
3995 4036
3996 p = kzalloc(sizeof(*p), GFP_KERNEL); 4037 p = kzalloc(sizeof(*p), gfp_mask);
3997 if (p == NULL) 4038 if (p == NULL)
3998 return NULL; 4039 return NULL;
3999 4040
4000 p->arg.fh = NFS_FH(inode); 4041 p->arg.fh = NFS_FH(inode);
4001 p->arg.fl = &p->fl; 4042 p->arg.fl = &p->fl;
4002 p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid); 4043 p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid, gfp_mask);
4003 if (p->arg.open_seqid == NULL) 4044 if (p->arg.open_seqid == NULL)
4004 goto out_free; 4045 goto out_free;
4005 p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid); 4046 p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid, gfp_mask);
4006 if (p->arg.lock_seqid == NULL) 4047 if (p->arg.lock_seqid == NULL)
4007 goto out_free_seqid; 4048 goto out_free_seqid;
4008 p->arg.lock_stateid = &lsp->ls_stateid; 4049 p->arg.lock_stateid = &lsp->ls_stateid;
@@ -4156,7 +4197,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
4156 4197
4157 dprintk("%s: begin!\n", __func__); 4198 dprintk("%s: begin!\n", __func__);
4158 data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file), 4199 data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file),
4159 fl->fl_u.nfs4_fl.owner); 4200 fl->fl_u.nfs4_fl.owner,
4201 recovery_type == NFS_LOCK_NEW ? GFP_KERNEL : GFP_NOFS);
4160 if (data == NULL) 4202 if (data == NULL)
4161 return -ENOMEM; 4203 return -ENOMEM;
4162 if (IS_SETLKW(cmd)) 4204 if (IS_SETLKW(cmd))
@@ -4645,7 +4687,7 @@ static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs,
4645 if (max_reqs != tbl->max_slots) { 4687 if (max_reqs != tbl->max_slots) {
4646 ret = -ENOMEM; 4688 ret = -ENOMEM;
4647 new = kmalloc(max_reqs * sizeof(struct nfs4_slot), 4689 new = kmalloc(max_reqs * sizeof(struct nfs4_slot),
4648 GFP_KERNEL); 4690 GFP_NOFS);
4649 if (!new) 4691 if (!new)
4650 goto out; 4692 goto out;
4651 ret = 0; 4693 ret = 0;
@@ -4710,7 +4752,7 @@ static int nfs4_init_slot_table(struct nfs4_slot_table *tbl,
4710 4752
4711 dprintk("--> %s: max_reqs=%u\n", __func__, max_slots); 4753 dprintk("--> %s: max_reqs=%u\n", __func__, max_slots);
4712 4754
4713 slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_KERNEL); 4755 slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_NOFS);
4714 if (!slot) 4756 if (!slot)
4715 goto out; 4757 goto out;
4716 ret = 0; 4758 ret = 0;
@@ -4759,7 +4801,7 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
4759 struct nfs4_session *session; 4801 struct nfs4_session *session;
4760 struct nfs4_slot_table *tbl; 4802 struct nfs4_slot_table *tbl;
4761 4803
4762 session = kzalloc(sizeof(struct nfs4_session), GFP_KERNEL); 4804 session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS);
4763 if (!session) 4805 if (!session)
4764 return NULL; 4806 return NULL;
4765 4807
@@ -5103,10 +5145,11 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp,
5103 5145
5104 if (!atomic_inc_not_zero(&clp->cl_count)) 5146 if (!atomic_inc_not_zero(&clp->cl_count))
5105 return -EIO; 5147 return -EIO;
5106 args = kzalloc(sizeof(*args), GFP_KERNEL); 5148 args = kzalloc(sizeof(*args), GFP_NOFS);
5107 res = kzalloc(sizeof(*res), GFP_KERNEL); 5149 res = kzalloc(sizeof(*res), GFP_NOFS);
5108 if (!args || !res) { 5150 if (!args || !res) {
5109 kfree(args); 5151 kfree(args);
5152 kfree(res);
5110 nfs_put_client(clp); 5153 nfs_put_client(clp);
5111 return -ENOMEM; 5154 return -ENOMEM;
5112 } 5155 }
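[Note] The one-line addition above closes a leak: when args and res are allocated back to back and either fails, both must be freed, since kfree(NULL) is a no-op but kfree of the surviving buffer is not optional. In miniature, with libc calls in place of the kernel allocators:

#include <stdlib.h>

static int submit(void)
{
	void *args = calloc(1, 64);
	void *res  = calloc(1, 64);

	if (!args || !res) {
		free(args);        /* free() tolerates NULL, so free both */
		free(res);         /* previously the res half leaked */
		return -1;
	}
	/* ... queue the request ... */
	free(args);
	free(res);
	return 0;
}

int main(void) { return submit() ? 1 : 0; }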
@@ -5204,7 +5247,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
5204 int status = -ENOMEM; 5247 int status = -ENOMEM;
5205 5248
5206 dprintk("--> %s\n", __func__); 5249 dprintk("--> %s\n", __func__);
5207 calldata = kzalloc(sizeof(*calldata), GFP_KERNEL); 5250 calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
5208 if (calldata == NULL) 5251 if (calldata == NULL)
5209 goto out; 5252 goto out;
5210 calldata->clp = clp; 5253 calldata->clp = clp;
@@ -5215,9 +5258,12 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
5215 msg.rpc_resp = &calldata->res; 5258 msg.rpc_resp = &calldata->res;
5216 task_setup_data.callback_data = calldata; 5259 task_setup_data.callback_data = calldata;
5217 task = rpc_run_task(&task_setup_data); 5260 task = rpc_run_task(&task_setup_data);
5218 if (IS_ERR(task)) 5261 if (IS_ERR(task)) {
5219 status = PTR_ERR(task); 5262 status = PTR_ERR(task);
5263 goto out;
5264 }
5220 rpc_put_task(task); 5265 rpc_put_task(task);
5266 return 0;
5221out: 5267out:
5222 dprintk("<-- %s status=%d\n", __func__, status); 5268 dprintk("<-- %s status=%d\n", __func__, status);
5223 return status; 5269 return status;
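[Note] The reclaim-complete fix above distinguishes the two exits from rpc_run_task(): an ERR_PTR return means there is no task to put, so the code jumps straight to the logging label, while a real task is dropped with rpc_put_task() and the function returns 0. A sketch of that control flow with a stubbed task constructor (NULL here plays the role of the kernel's ERR_PTR):

#include <stdio.h>
#include <stdlib.h>

struct task { int refs; };

/* Stub: returns NULL to model rpc_run_task() failing. */
static struct task *run_task(int fail)
{
	if (fail)
		return NULL;
	struct task *t = malloc(sizeof(*t));
	if (t)
		t->refs = 1;
	return t;
}

static void put_task(struct task *t) { free(t); }

static int reclaim_complete(int fail)
{
	int status = -12;          /* -ENOMEM, mirroring the kernel default */
	struct task *task = run_task(fail);

	if (task == NULL) {        /* IS_ERR(task) in the kernel */
		status = -5;       /* PTR_ERR(task), e.g. -EIO */
		goto out;          /* no task exists, so nothing to put */
	}
	put_task(task);            /* drop our reference */
	return 0;                  /* success bypasses the logging label */
out:
	printf("reclaim_complete status=%d\n", status);
	return status;
}

int main(void)
{
	reclaim_complete(1);
	return reclaim_complete(0) ? 1 : 0;
}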
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 6c5ed51f105e..34acf5926fdc 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -62,6 +62,7 @@ static LIST_HEAD(nfs4_clientid_list);
62 62
63int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) 63int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
64{ 64{
65 struct nfs4_setclientid_res clid;
65 unsigned short port; 66 unsigned short port;
66 int status; 67 int status;
67 68
@@ -69,11 +70,15 @@ int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
69 if (clp->cl_addr.ss_family == AF_INET6) 70 if (clp->cl_addr.ss_family == AF_INET6)
70 port = nfs_callback_tcpport6; 71 port = nfs_callback_tcpport6;
71 72
72 status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred); 73 status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid);
73 if (status == 0) 74 if (status != 0)
74 status = nfs4_proc_setclientid_confirm(clp, cred); 75 goto out;
75 if (status == 0) 76 status = nfs4_proc_setclientid_confirm(clp, &clid, cred);
76 nfs4_schedule_state_renewal(clp); 77 if (status != 0)
78 goto out;
79 clp->cl_clientid = clid.clientid;
80 nfs4_schedule_state_renewal(clp);
81out:
77 return status; 82 return status;
78} 83}
79 84
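[Note] The nfs4_init_clientid() rewrite above makes the SETCLIENTID handshake explicit: the first call fills a caller-owned nfs4_setclientid_res instead of scribbling into struct nfs_client, the confirm step echoes that result back, and only after both succeed is the client id committed and renewal scheduled. A compact model of the two-phase commit, with simplified fields and stubbed RPCs:

#include <stdio.h>

struct setclientid_res { unsigned long long clientid; };
struct client { unsigned long long cl_clientid; };

static int proc_setclientid(struct setclientid_res *res)
{
	res->clientid = 0xabcdefULL;   /* server-assigned id */
	return 0;
}

static int proc_setclientid_confirm(const struct setclientid_res *res)
{
	return res->clientid ? 0 : -1; /* echo the id back to the server */
}

static int init_clientid(struct client *clp)
{
	struct setclientid_res clid;
	int status;

	status = proc_setclientid(&clid);
	if (status != 0)
		goto out;
	status = proc_setclientid_confirm(&clid);
	if (status != 0)
		goto out;
	clp->cl_clientid = clid.clientid;  /* commit only after confirm */
	/* ... schedule state renewal here ... */
out:
	return status;
}

int main(void)
{
	struct client clp = { 0 };
	int err = init_clientid(&clp);
	printf("clientid=%llx err=%d\n", clp.cl_clientid, err);
	return err ? 1 : 0;
}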
@@ -361,7 +366,7 @@ nfs4_alloc_state_owner(void)
361{ 366{
362 struct nfs4_state_owner *sp; 367 struct nfs4_state_owner *sp;
363 368
364 sp = kzalloc(sizeof(*sp),GFP_KERNEL); 369 sp = kzalloc(sizeof(*sp),GFP_NOFS);
365 if (!sp) 370 if (!sp)
366 return NULL; 371 return NULL;
367 spin_lock_init(&sp->so_lock); 372 spin_lock_init(&sp->so_lock);
@@ -435,7 +440,7 @@ nfs4_alloc_open_state(void)
435{ 440{
436 struct nfs4_state *state; 441 struct nfs4_state *state;
437 442
438 state = kzalloc(sizeof(*state), GFP_KERNEL); 443 state = kzalloc(sizeof(*state), GFP_NOFS);
439 if (!state) 444 if (!state)
440 return NULL; 445 return NULL;
441 atomic_set(&state->count, 1); 446 atomic_set(&state->count, 1);
@@ -537,7 +542,8 @@ void nfs4_put_open_state(struct nfs4_state *state)
537/* 542/*
538 * Close the current file. 543 * Close the current file.
539 */ 544 */
540static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fmode, int wait) 545static void __nfs4_close(struct path *path, struct nfs4_state *state,
546 fmode_t fmode, gfp_t gfp_mask, int wait)
541{ 547{
542 struct nfs4_state_owner *owner = state->owner; 548 struct nfs4_state_owner *owner = state->owner;
543 int call_close = 0; 549 int call_close = 0;
@@ -578,17 +584,17 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fm
578 nfs4_put_open_state(state); 584 nfs4_put_open_state(state);
579 nfs4_put_state_owner(owner); 585 nfs4_put_state_owner(owner);
580 } else 586 } else
581 nfs4_do_close(path, state, wait); 587 nfs4_do_close(path, state, gfp_mask, wait);
582} 588}
583 589
584void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) 590void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
585{ 591{
586 __nfs4_close(path, state, fmode, 0); 592 __nfs4_close(path, state, fmode, GFP_NOFS, 0);
587} 593}
588 594
589void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode) 595void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode)
590{ 596{
591 __nfs4_close(path, state, fmode, 1); 597 __nfs4_close(path, state, fmode, GFP_KERNEL, 1);
592} 598}
593 599
594/* 600/*
@@ -618,7 +624,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
618 struct nfs4_lock_state *lsp; 624 struct nfs4_lock_state *lsp;
619 struct nfs_client *clp = state->owner->so_client; 625 struct nfs_client *clp = state->owner->so_client;
620 626
621 lsp = kzalloc(sizeof(*lsp), GFP_KERNEL); 627 lsp = kzalloc(sizeof(*lsp), GFP_NOFS);
622 if (lsp == NULL) 628 if (lsp == NULL)
623 return NULL; 629 return NULL;
624 rpc_init_wait_queue(&lsp->ls_sequence.wait, "lock_seqid_waitqueue"); 630 rpc_init_wait_queue(&lsp->ls_sequence.wait, "lock_seqid_waitqueue");
@@ -754,11 +760,11 @@ void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t f
754 nfs4_put_lock_state(lsp); 760 nfs4_put_lock_state(lsp);
755} 761}
756 762
757struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter) 763struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask)
758{ 764{
759 struct nfs_seqid *new; 765 struct nfs_seqid *new;
760 766
761 new = kmalloc(sizeof(*new), GFP_KERNEL); 767 new = kmalloc(sizeof(*new), gfp_mask);
762 if (new != NULL) { 768 if (new != NULL) {
763 new->sequence = counter; 769 new->sequence = counter;
764 INIT_LIST_HEAD(&new->list); 770 INIT_LIST_HEAD(&new->list);
@@ -1347,7 +1353,7 @@ static int nfs4_recall_slot(struct nfs_client *clp)
1347 1353
1348 nfs4_begin_drain_session(clp); 1354 nfs4_begin_drain_session(clp);
1349 new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot), 1355 new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot),
1350 GFP_KERNEL); 1356 GFP_NOFS);
1351 if (!new) 1357 if (!new)
1352 return -ENOMEM; 1358 return -ENOMEM;
1353 1359
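The allocation changes in nfs4state.c all follow one rule: code that can run while the VM is flushing NFS pages must allocate with GFP_NOFS, so that memory reclaim cannot re-enter the filesystem and deadlock, while purely synchronous callers may keep GFP_KERNEL. That is why __nfs4_close() now threads a gfp_t down to nfs4_do_close() instead of the leaf hard-coding it. Below is a toy userspace sketch of that threading idea only; the gfp modes are reduced to an enum and alloc_seqid() merely mirrors the nfs_alloc_seqid(counter, gfp_mask) signature from the hunk above.

#include <stdio.h>
#include <stdlib.h>

enum gfp { GFP_KERNEL, GFP_NOFS };

static void *alloc_seqid(enum gfp mask)
{
	/* a real GFP_NOFS allocation must not recurse into fs writeback */
	printf("allocating with %s\n",
	       mask == GFP_NOFS ? "GFP_NOFS" : "GFP_KERNEL");
	return malloc(16);
}

static void do_close(enum gfp mask) { free(alloc_seqid(mask)); }

int main(void)
{
	do_close(GFP_NOFS);	/* async close: may run under reclaim */
	do_close(GFP_KERNEL);	/* synchronous close: safe to block */
	return 0;
}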
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4d338be492cb..6bdef28efa33 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -38,7 +38,6 @@
 #include <linux/param.h>
 #include <linux/time.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/in.h>
@@ -1505,14 +1504,14 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
 	hdr->replen += decode_setclientid_maxsz;
 }
 
-static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state, struct compound_hdr *hdr)
+static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr)
 {
 	__be32 *p;
 
 	p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM);
-	p = xdr_encode_hyper(p, client_state->cl_clientid);
-	xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
+	p = xdr_encode_hyper(p, arg->clientid);
+	xdr_encode_opaque_fixed(p, arg->confirm.data, NFS4_VERIFIER_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_setclientid_confirm_maxsz;
 }
@@ -2325,7 +2324,7 @@ static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4
 /*
  * a SETCLIENTID_CONFIRM request
  */
-static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp)
+static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid_res *arg)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -2335,7 +2334,7 @@ static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
 	encode_compound_hdr(&xdr, req, &hdr);
-	encode_setclientid_confirm(&xdr, clp, &hdr);
+	encode_setclientid_confirm(&xdr, arg, &hdr);
 	encode_putrootfh(&xdr, &hdr);
 	encode_fsinfo(&xdr, lease_bitmap, &hdr);
 	encode_nops(&hdr);
@@ -4398,7 +4397,7 @@ out_overflow:
 	return -EIO;
 }
 
-static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
+static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_res *res)
 {
 	__be32 *p;
 	uint32_t opnum;
@@ -4418,8 +4417,8 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 		p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, &clp->cl_clientid);
-		memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE);
+		p = xdr_decode_hyper(p, &res->clientid);
+		memcpy(res->confirm.data, p, NFS4_VERIFIER_SIZE);
 	} else if (nfserr == NFSERR_CLID_INUSE) {
 		uint32_t len;
 
@@ -4816,7 +4815,7 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem
 		goto out;
 	if ((status = decode_remove(&xdr, &res->cinfo)) != 0)
 		goto out;
-	decode_getfattr(&xdr, &res->dir_attr, res->server,
+	decode_getfattr(&xdr, res->dir_attr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5499,7 +5498,7 @@ static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
  * Decode SETCLIENTID response
  */
 static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
-		struct nfs_client *clp)
+		struct nfs4_setclientid_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -5508,7 +5507,7 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
 	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (!status)
-		status = decode_setclientid(&xdr, clp);
+		status = decode_setclientid(&xdr, res);
 	return status;
 }
 
@@ -5552,6 +5551,8 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nf
 	if (status != 0)
 		goto out;
 	status = decode_delegreturn(&xdr);
+	if (status != 0)
+		goto out;
 	decode_getfattr(&xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
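The SETCLIENTID rework above decodes the server's reply into a dedicated struct nfs4_setclientid_res instead of writing straight into the shared struct nfs_client, and the caller in nfs4state.c only publishes the clientid once SETCLIENTID_CONFIRM has succeeded. A compact userspace sketch of that two-step commit pattern follows; the proc_* functions and the struct fields are hypothetical stand-ins for the RPC calls.

#include <stdint.h>
#include <stdio.h>

struct setclientid_res { uint64_t clientid; unsigned char confirm[8]; };
struct client { uint64_t cl_clientid; };

static int proc_setclientid(struct setclientid_res *res)
{ res->clientid = 0x1234; return 0; }
static int proc_setclientid_confirm(const struct setclientid_res *res)
{ (void)res; return 0; }

static int init_clientid(struct client *clp)
{
	struct setclientid_res clid;
	int status;

	status = proc_setclientid(&clid);
	if (status != 0)
		goto out;
	status = proc_setclientid_confirm(&clid);
	if (status != 0)
		goto out;
	clp->cl_clientid = clid.clientid;	/* publish only after confirm */
out:
	return status;
}

int main(void)
{
	struct client c = { 0 };
	printf("status=%d clientid=%llx\n", init_clientid(&c),
	       (unsigned long long)c.cl_clientid);
	return 0;
}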
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 8c55b27c0de4..6bd19d843af7 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -488,7 +488,6 @@ static int __init root_nfs_ports(void)
  */
 static int __init root_nfs_get_handle(void)
 {
-	struct nfs_fh fh;
 	struct sockaddr_in sin;
 	unsigned int auth_flav_len = 0;
 	struct nfs_mount_request request = {
@@ -499,21 +498,24 @@ static int __init root_nfs_get_handle(void)
 				NFS_MNT3_VERSION : NFS_MNT_VERSION,
 		.protocol = (nfs_data.flags & NFS_MOUNT_TCP) ?
 				XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP,
-		.fh = &fh,
 		.auth_flav_len = &auth_flav_len,
 	};
-	int status;
+	int status = -ENOMEM;
 
+	request.fh = nfs_alloc_fhandle();
+	if (!request.fh)
+		goto out;
 	set_sockaddr(&sin, servaddr, htons(mount_port));
 	status = nfs_mount(&request);
 	if (status < 0)
 		printk(KERN_ERR "Root-NFS: Server returned error %d "
 				"while mounting %s\n", status, nfs_export_path);
 	else {
-		nfs_data.root.size = fh.size;
-		memcpy(nfs_data.root.data, fh.data, fh.size);
+		nfs_data.root.size = request.fh->size;
+		memcpy(&nfs_data.root.data, request.fh->data, request.fh->size);
 	}
-
+	nfs_free_fhandle(request.fh);
+out:
 	return status;
 }
 
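struct nfs_fh embeds a fixed filehandle buffer (128 bytes for NFSv4), so keeping one on the stack in a mount path is costly on small kernel stacks; the hunk above moves it to the heap via nfs_alloc_fhandle()/nfs_free_fhandle(). A userspace sketch of the same stack-to-heap move follows; the helpers are re-implemented locally for illustration and the error value stands in for -ENOMEM.

#include <stdlib.h>

struct nfs_fh { unsigned short size; unsigned char data[128]; };

static struct nfs_fh *nfs_alloc_fhandle(void)
{ return calloc(1, sizeof(struct nfs_fh)); }
static void nfs_free_fhandle(struct nfs_fh *fh) { free(fh); }

static int get_handle(void)
{
	int status = -1;			/* -ENOMEM in the kernel */
	struct nfs_fh *fh = nfs_alloc_fhandle();

	if (!fh)
		goto out;
	/* ... the mount call would fill *fh here ... */
	status = 0;
	nfs_free_fhandle(fh);
out:
	return status;
}

int main(void) { return get_handle(); }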
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index a12c45b65dd4..a3654e57b589 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -60,16 +60,10 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
 {
 	struct nfs_page *req;
 
-	for (;;) {
-		/* try to allocate the request struct */
-		req = nfs_page_alloc();
-		if (req != NULL)
-			break;
-
-		if (fatal_signal_pending(current))
-			return ERR_PTR(-ERESTARTSYS);
-		yield();
-	}
+	/* try to allocate the request struct */
+	req = nfs_page_alloc();
+	if (req == NULL)
+		return ERR_PTR(-ENOMEM);
 
 	/* Initialize the request struct. Initially, we assume a
 	 * long write-back delay. This will be adjusted in
@@ -112,12 +106,10 @@ void nfs_unlock_request(struct nfs_page *req)
  */
 int nfs_set_page_tag_locked(struct nfs_page *req)
 {
-	struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode);
-
 	if (!nfs_lock_request_dontget(req))
 		return 0;
 	if (req->wb_page != NULL)
-		radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
+		radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
 	return 1;
 }
 
@@ -126,10 +118,10 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
  */
 void nfs_clear_page_tag_locked(struct nfs_page *req)
 {
-	struct inode *inode = req->wb_context->path.dentry->d_inode;
-	struct nfs_inode *nfsi = NFS_I(inode);
-
 	if (req->wb_page != NULL) {
+		struct inode *inode = req->wb_context->path.dentry->d_inode;
+		struct nfs_inode *nfsi = NFS_I(inode);
+
 		spin_lock(&inode->i_lock);
 		radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
 		nfs_unlock_request(req);
@@ -142,16 +134,22 @@ void nfs_clear_page_tag_locked(struct nfs_page *req)
 * nfs_clear_request - Free up all resources allocated to the request
 * @req:
 *
- * Release page resources associated with a write request after it
- * has completed.
+ * Release page and open context resources associated with a read/write
+ * request after it has completed.
 */
 void nfs_clear_request(struct nfs_page *req)
 {
 	struct page *page = req->wb_page;
+	struct nfs_open_context *ctx = req->wb_context;
+
 	if (page != NULL) {
 		page_cache_release(page);
 		req->wb_page = NULL;
 	}
+	if (ctx != NULL) {
+		put_nfs_open_context(ctx);
+		req->wb_context = NULL;
+	}
 }
 
 
@@ -165,9 +163,8 @@ static void nfs_free_request(struct kref *kref)
 {
 	struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
 
-	/* Release struct file or cached credential */
+	/* Release struct file and open context */
 	nfs_clear_request(req);
-	put_nfs_open_context(req->wb_context);
 	nfs_page_free(req);
 }
 
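Two cleanups in pagelist.c above: nfs_create_request() now fails fast with ERR_PTR(-ENOMEM) instead of looping on yield(), and nfs_clear_request() releases both the page and the open context, so it can be called early while the final kref release stays trivial. A userspace sketch of the consolidated, idempotent teardown follows; the types and helpers are stand-ins, not the kernel structures.

#include <stdlib.h>

struct ctx { int users; };
struct req { char *page; struct ctx *ctx; };

static void put_ctx(struct ctx *c) { if (--c->users == 0) free(c); }

static void clear_request(struct req *req)
{
	if (req->page) {		/* page_cache_release() in the kernel */
		free(req->page);
		req->page = NULL;
	}
	if (req->ctx) {			/* put_nfs_open_context() */
		put_ctx(req->ctx);
		req->ctx = NULL;
	}
}

static void free_request(struct req *req)
{
	clear_request(req);		/* idempotent: safe if already cleared */
	free(req);
}

int main(void)
{
	struct ctx *c = calloc(1, sizeof(*c));
	struct req *r = calloc(1, sizeof(*r));

	c->users = 1;
	r->page = malloc(4096);
	r->ctx = c;
	clear_request(r);		/* early clear, e.g. after I/O completes */
	free_request(r);		/* still safe: fields are NULL now */
	return 0;
}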
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index c752d944fe9e..611bec22f552 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -29,7 +29,6 @@
 
 #include <linux/types.h>
 #include <linux/param.h>
-#include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/errno.h>
@@ -225,35 +224,60 @@ static int nfs_proc_readlink(struct inode *inode, struct page *page,
 	return status;
 }
 
+struct nfs_createdata {
+	struct nfs_createargs arg;
+	struct nfs_diropok res;
+	struct nfs_fh fhandle;
+	struct nfs_fattr fattr;
+};
+
+static struct nfs_createdata *nfs_alloc_createdata(struct inode *dir,
+		struct dentry *dentry, struct iattr *sattr)
+{
+	struct nfs_createdata *data;
+
+	data = kmalloc(sizeof(*data), GFP_KERNEL);
+
+	if (data != NULL) {
+		data->arg.fh = NFS_FH(dir);
+		data->arg.name = dentry->d_name.name;
+		data->arg.len = dentry->d_name.len;
+		data->arg.sattr = sattr;
+		nfs_fattr_init(&data->fattr);
+		data->fhandle.size = 0;
+		data->res.fh = &data->fhandle;
+		data->res.fattr = &data->fattr;
+	}
+	return data;
+};
+
+static void nfs_free_createdata(const struct nfs_createdata *data)
+{
+	kfree(data);
+}
+
 static int
 nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		int flags, struct nameidata *nd)
 {
-	struct nfs_fh fhandle;
-	struct nfs_fattr fattr;
-	struct nfs_createargs arg = {
-		.fh = NFS_FH(dir),
-		.name = dentry->d_name.name,
-		.len = dentry->d_name.len,
-		.sattr = sattr
-	};
-	struct nfs_diropok res = {
-		.fh = &fhandle,
-		.fattr = &fattr
-	};
+	struct nfs_createdata *data;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs_procedures[NFSPROC_CREATE],
-		.rpc_argp = &arg,
-		.rpc_resp = &res,
 	};
-	int status;
+	int status = -ENOMEM;
 
-	nfs_fattr_init(&fattr);
 	dprintk("NFS call create %s\n", dentry->d_name.name);
+	data = nfs_alloc_createdata(dir, dentry, sattr);
+	if (data == NULL)
+		goto out;
+	msg.rpc_argp = &data->arg;
+	msg.rpc_resp = &data->res;
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_mark_for_revalidate(dir);
 	if (status == 0)
-		status = nfs_instantiate(dentry, &fhandle, &fattr);
+		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+	nfs_free_createdata(data);
+out:
 	dprintk("NFS reply create: %d\n", status);
 	return status;
 }
@@ -265,24 +289,12 @@ static int
 nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		dev_t rdev)
 {
-	struct nfs_fh fhandle;
-	struct nfs_fattr fattr;
-	struct nfs_createargs arg = {
-		.fh = NFS_FH(dir),
-		.name = dentry->d_name.name,
-		.len = dentry->d_name.len,
-		.sattr = sattr
-	};
-	struct nfs_diropok res = {
-		.fh = &fhandle,
-		.fattr = &fattr
-	};
+	struct nfs_createdata *data;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs_procedures[NFSPROC_CREATE],
-		.rpc_argp = &arg,
-		.rpc_resp = &res,
 	};
-	int status, mode;
+	umode_t mode;
+	int status = -ENOMEM;
 
 	dprintk("NFS call mknod %s\n", dentry->d_name.name);
 
@@ -295,17 +307,24 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		sattr->ia_size = new_encode_dev(rdev);/* get out your barf bag */
 	}
 
-	nfs_fattr_init(&fattr);
+	data = nfs_alloc_createdata(dir, dentry, sattr);
+	if (data == NULL)
+		goto out;
+	msg.rpc_argp = &data->arg;
+	msg.rpc_resp = &data->res;
+
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_mark_for_revalidate(dir);
 
 	if (status == -EINVAL && S_ISFIFO(mode)) {
 		sattr->ia_mode = mode;
-		nfs_fattr_init(&fattr);
+		nfs_fattr_init(data->res.fattr);
 		status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	}
 	if (status == 0)
-		status = nfs_instantiate(dentry, &fhandle, &fattr);
+		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+	nfs_free_createdata(data);
+out:
 	dprintk("NFS reply mknod: %d\n", status);
 	return status;
 }
@@ -399,8 +418,8 @@ static int
 nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
 		unsigned int len, struct iattr *sattr)
 {
-	struct nfs_fh fhandle;
-	struct nfs_fattr fattr;
+	struct nfs_fh *fh;
+	struct nfs_fattr *fattr;
 	struct nfs_symlinkargs arg = {
 		.fromfh = NFS_FH(dir),
 		.fromname = dentry->d_name.name,
@@ -413,12 +432,18 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
 		.rpc_proc = &nfs_procedures[NFSPROC_SYMLINK],
 		.rpc_argp = &arg,
 	};
-	int status;
+	int status = -ENAMETOOLONG;
+
+	dprintk("NFS call symlink %s\n", dentry->d_name.name);
 
 	if (len > NFS2_MAXPATHLEN)
-		return -ENAMETOOLONG;
+		goto out;
 
-	dprintk("NFS call symlink %s\n", dentry->d_name.name);
+	fh = nfs_alloc_fhandle();
+	fattr = nfs_alloc_fattr();
+	status = -ENOMEM;
+	if (fh == NULL || fattr == NULL)
+		goto out;
 
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_mark_for_revalidate(dir);
@@ -428,12 +453,12 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
 	 * filehandle size to zero indicates to nfs_instantiate that it
 	 * should fill in the data with a LOOKUP call on the wire.
 	 */
-	if (status == 0) {
-		nfs_fattr_init(&fattr);
-		fhandle.size = 0;
-		status = nfs_instantiate(dentry, &fhandle, &fattr);
-	}
+	if (status == 0)
+		status = nfs_instantiate(dentry, fh, fattr);
 
+	nfs_free_fattr(fattr);
+	nfs_free_fhandle(fh);
+out:
 	dprintk("NFS reply symlink: %d\n", status);
 	return status;
 }
@@ -441,31 +466,25 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
 static int
 nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
 {
-	struct nfs_fh fhandle;
-	struct nfs_fattr fattr;
-	struct nfs_createargs arg = {
-		.fh = NFS_FH(dir),
-		.name = dentry->d_name.name,
-		.len = dentry->d_name.len,
-		.sattr = sattr
-	};
-	struct nfs_diropok res = {
-		.fh = &fhandle,
-		.fattr = &fattr
-	};
+	struct nfs_createdata *data;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs_procedures[NFSPROC_MKDIR],
-		.rpc_argp = &arg,
-		.rpc_resp = &res,
 	};
-	int status;
+	int status = -ENOMEM;
 
 	dprintk("NFS call mkdir %s\n", dentry->d_name.name);
-	nfs_fattr_init(&fattr);
+	data = nfs_alloc_createdata(dir, dentry, sattr);
+	if (data == NULL)
+		goto out;
+	msg.rpc_argp = &data->arg;
+	msg.rpc_resp = &data->res;
+
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_mark_for_revalidate(dir);
 	if (status == 0)
-		status = nfs_instantiate(dentry, &fhandle, &fattr);
+		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+	nfs_free_createdata(data);
+out:
 	dprintk("NFS reply mkdir: %d\n", status);
 	return status;
 }
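nfs_proc_create(), nfs_proc_mknod() and nfs_proc_mkdir() previously kept struct nfs_fh, struct nfs_fattr and the RPC argument/result structs on the stack; nfs_alloc_createdata() above packs them into one heap object whose result pointers are wired to its own storage. A reduced userspace sketch of that self-referential bundling follows, with illustrative types rather than the kernel's.

#include <stdlib.h>

struct fh { unsigned char data[128]; };
struct fattr { long size; };
struct args { const char *name; };
struct res { struct fh *fh; struct fattr *fattr; };

struct createdata {
	struct args arg;
	struct res res;
	struct fh fhandle;	/* storage the res pointers refer to */
	struct fattr fattr;
};

static struct createdata *alloc_createdata(const char *name)
{
	struct createdata *d = calloc(1, sizeof(*d));

	if (d) {
		d->arg.name = name;
		d->res.fh = &d->fhandle;	/* wire up once, reuse everywhere */
		d->res.fattr = &d->fattr;
	}
	return d;
}

int main(void)
{
	struct createdata *d = alloc_createdata("file");
	free(d);		/* one kfree() replaces several stack objects */
	return 0;
}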
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index db9b360ae19d..6e2b06e6ca79 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -40,7 +40,7 @@ static mempool_t *nfs_rdata_mempool;
 
 struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 {
-	struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_NOFS);
+	struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_KERNEL);
 
 	if (p) {
 		memset(p, 0, sizeof(*p));
@@ -50,7 +50,7 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 		if (pagecount <= ARRAY_SIZE(p->page_array))
 			p->pagevec = p->page_array;
 		else {
-			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
+			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
 			if (!p->pagevec) {
 				mempool_free(p, nfs_rdata_mempool);
 				p = NULL;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f1afee4eea77..2f8b1157daa2 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -48,6 +48,7 @@
 #include <linux/vfs.h>
 #include <linux/inet.h>
 #include <linux/in6.h>
+#include <linux/slab.h>
 #include <net/ipv6.h>
 #include <linux/netdevice.h>
 #include <linux/nfs_xdr.h>
@@ -140,7 +141,6 @@ static const match_table_t nfs_mount_option_tokens = {
 	{ Opt_resvport, "resvport" },
 	{ Opt_noresvport, "noresvport" },
 	{ Opt_fscache, "fsc" },
-	{ Opt_fscache_uniq, "fsc=%s" },
 	{ Opt_nofscache, "nofsc" },
 
 	{ Opt_port, "port=%s" },
@@ -170,6 +170,7 @@ static const match_table_t nfs_mount_option_tokens = {
 	{ Opt_mountaddr, "mountaddr=%s" },
 
 	{ Opt_lookupcache, "lookupcache=%s" },
+	{ Opt_fscache_uniq, "fsc=%s" },
 
 	{ Opt_err, NULL }
 };
@@ -422,15 +423,19 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	unsigned char blockbits;
 	unsigned long blockres;
 	struct nfs_fh *fh = NFS_FH(dentry->d_inode);
-	struct nfs_fattr fattr;
-	struct nfs_fsstat res = {
-		.fattr = &fattr,
-	};
-	int error;
+	struct nfs_fsstat res;
+	int error = -ENOMEM;
+
+	res.fattr = nfs_alloc_fattr();
+	if (res.fattr == NULL)
+		goto out_err;
 
 	error = server->nfs_client->rpc_ops->statfs(server, fh, &res);
+
+	nfs_free_fattr(res.fattr);
 	if (error < 0)
 		goto out_err;
+
 	buf->f_type = NFS_SUPER_MAGIC;
 
 	/*
@@ -1045,14 +1050,6 @@ static int nfs_parse_mount_options(char *raw,
 			kfree(mnt->fscache_uniq);
 			mnt->fscache_uniq = NULL;
 			break;
-		case Opt_fscache_uniq:
-			string = match_strdup(args);
-			if (!string)
-				goto out_nomem;
-			kfree(mnt->fscache_uniq);
-			mnt->fscache_uniq = string;
-			mnt->options |= NFS_OPTION_FSCACHE;
-			break;
 
 		/*
 		 * options that take numeric values
@@ -1383,6 +1380,14 @@ static int nfs_parse_mount_options(char *raw,
 				return 0;
 			};
 			break;
+		case Opt_fscache_uniq:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			kfree(mnt->fscache_uniq);
+			mnt->fscache_uniq = string;
+			mnt->options |= NFS_OPTION_FSCACHE;
+			break;
 
 		/*
 		 * Special options
@@ -2171,7 +2176,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	int error = -ENOMEM;
 
 	data = nfs_alloc_parsed_mount_data(3);
-	mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
+	mntfh = nfs_alloc_fhandle();
 	if (data == NULL || mntfh == NULL)
 		goto out_free_fh;
 
@@ -2186,6 +2191,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	if (data->version == 4) {
 		error = nfs4_try_mount(flags, dev_name, data, mnt);
 		kfree(data->client_address);
+		kfree(data->nfs_server.export_path);
 		goto out;
 	}
 #endif /* CONFIG_NFS_V4 */
@@ -2214,7 +2220,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	} else {
 		error = nfs_bdi_register(server);
 		if (error)
-			goto error_splat_super;
+			goto error_splat_bdi;
 	}
 
 	if (!s->s_root) {
@@ -2245,7 +2251,7 @@ out:
 	kfree(data->fscache_uniq);
 	security_free_mnt_opts(&data->lsm_opts);
 out_free_fh:
-	kfree(mntfh);
+	nfs_free_fhandle(mntfh);
 	kfree(data);
 	return error;
 
@@ -2256,6 +2262,9 @@ out_err_nosb:
 error_splat_root:
 	dput(mntroot);
 error_splat_super:
+	if (server && !s->s_root)
+		bdi_unregister(&server->backing_dev_info);
+error_splat_bdi:
 	deactivate_locked_super(s);
 	goto out;
 }
@@ -2326,7 +2335,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
 	} else {
 		error = nfs_bdi_register(server);
 		if (error)
-			goto error_splat_super;
+			goto error_splat_bdi;
 	}
 
 	if (!s->s_root) {
@@ -2363,6 +2372,9 @@ out_err_noserver:
 	return error;
 
 error_splat_super:
+	if (server && !s->s_root)
+		bdi_unregister(&server->backing_dev_info);
+error_splat_bdi:
 	deactivate_locked_super(s);
 	dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error);
 	return error;
@@ -2548,7 +2560,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
 	};
 	int error = -ENOMEM;
 
-	mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
+	mntfh = nfs_alloc_fhandle();
 	if (data == NULL || mntfh == NULL)
 		goto out_free_fh;
 
@@ -2578,7 +2590,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
 	} else {
 		error = nfs_bdi_register(server);
 		if (error)
-			goto error_splat_super;
+			goto error_splat_bdi;
 	}
 
 	if (!s->s_root) {
@@ -2606,7 +2618,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
 out:
 	security_free_mnt_opts(&data->lsm_opts);
 out_free_fh:
-	kfree(mntfh);
+	nfs_free_fhandle(mntfh);
 	return error;
 
 out_free:
@@ -2616,6 +2628,9 @@ out_free:
 error_splat_root:
 	dput(mntroot);
 error_splat_super:
+	if (server && !s->s_root)
+		bdi_unregister(&server->backing_dev_info);
+error_splat_bdi:
 	deactivate_locked_super(s);
 	goto out;
 }
@@ -2647,7 +2662,7 @@ static void nfs_fix_devname(const struct path *path, struct vfsmount *mnt)
 	devname = nfs_path(path->mnt->mnt_devname,
 			path->mnt->mnt_root, path->dentry,
 			page, PAGE_SIZE);
-	if (devname == NULL)
+	if (IS_ERR(devname))
 		goto out_freepage;
 	tmp = kstrdup(devname, GFP_KERNEL);
 	if (tmp == NULL)
@@ -2658,41 +2673,120 @@ out_freepage:
 	free_page((unsigned long)page);
 }
 
+struct nfs_referral_count {
+	struct list_head list;
+	const struct task_struct *task;
+	unsigned int referral_count;
+};
+
+static LIST_HEAD(nfs_referral_count_list);
+static DEFINE_SPINLOCK(nfs_referral_count_list_lock);
+
+static struct nfs_referral_count *nfs_find_referral_count(void)
+{
+	struct nfs_referral_count *p;
+
+	list_for_each_entry(p, &nfs_referral_count_list, list) {
+		if (p->task == current)
+			return p;
+	}
+	return NULL;
+}
+
+#define NFS_MAX_NESTED_REFERRALS 2
+
+static int nfs_referral_loop_protect(void)
+{
+	struct nfs_referral_count *p, *new;
+	int ret = -ENOMEM;
+
+	new = kmalloc(sizeof(*new), GFP_KERNEL);
+	if (!new)
+		goto out;
+	new->task = current;
+	new->referral_count = 1;
+
+	ret = 0;
+	spin_lock(&nfs_referral_count_list_lock);
+	p = nfs_find_referral_count();
+	if (p != NULL) {
+		if (p->referral_count >= NFS_MAX_NESTED_REFERRALS)
+			ret = -ELOOP;
+		else
+			p->referral_count++;
+	} else {
+		list_add(&new->list, &nfs_referral_count_list);
+		new = NULL;
+	}
+	spin_unlock(&nfs_referral_count_list_lock);
+	kfree(new);
+out:
+	return ret;
+}
+
+static void nfs_referral_loop_unprotect(void)
+{
+	struct nfs_referral_count *p;
+
+	spin_lock(&nfs_referral_count_list_lock);
+	p = nfs_find_referral_count();
+	p->referral_count--;
+	if (p->referral_count == 0)
+		list_del(&p->list);
+	else
+		p = NULL;
+	spin_unlock(&nfs_referral_count_list_lock);
+	kfree(p);
+}
+
 static int nfs_follow_remote_path(struct vfsmount *root_mnt,
 		const char *export_path, struct vfsmount *mnt_target)
 {
+	struct nameidata *nd = NULL;
 	struct mnt_namespace *ns_private;
-	struct nameidata nd;
 	struct super_block *s;
 	int ret;
 
+	nd = kmalloc(sizeof(*nd), GFP_KERNEL);
+	if (nd == NULL)
+		return -ENOMEM;
+
 	ns_private = create_mnt_ns(root_mnt);
 	ret = PTR_ERR(ns_private);
 	if (IS_ERR(ns_private))
 		goto out_mntput;
 
+	ret = nfs_referral_loop_protect();
+	if (ret != 0)
+		goto out_put_mnt_ns;
+
 	ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt,
-			export_path, LOOKUP_FOLLOW, &nd);
+			export_path, LOOKUP_FOLLOW, nd);
 
+	nfs_referral_loop_unprotect();
 	put_mnt_ns(ns_private);
 
 	if (ret != 0)
 		goto out_err;
 
-	s = nd.path.mnt->mnt_sb;
+	s = nd->path.mnt->mnt_sb;
 	atomic_inc(&s->s_active);
 	mnt_target->mnt_sb = s;
-	mnt_target->mnt_root = dget(nd.path.dentry);
+	mnt_target->mnt_root = dget(nd->path.dentry);
 
 	/* Correct the device pathname */
-	nfs_fix_devname(&nd.path, mnt_target);
+	nfs_fix_devname(&nd->path, mnt_target);
 
-	path_put(&nd.path);
+	path_put(&nd->path);
+	kfree(nd);
 	down_write(&s->s_umount);
 	return 0;
+out_put_mnt_ns:
+	put_mnt_ns(ns_private);
 out_mntput:
 	mntput(root_mnt);
 out_err:
+	kfree(nd);
 	return ret;
 }
 
@@ -2811,7 +2905,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
 	} else {
 		error = nfs_bdi_register(server);
 		if (error)
-			goto error_splat_super;
+			goto error_splat_bdi;
 	}
 
 	if (!s->s_root) {
@@ -2847,6 +2941,9 @@ out_err_noserver:
 	return error;
 
 error_splat_super:
+	if (server && !s->s_root)
+		bdi_unregister(&server->backing_dev_info);
+error_splat_bdi:
 	deactivate_locked_super(s);
 	dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error);
 	return error;
@@ -2860,17 +2957,21 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
 	struct super_block *s;
 	struct nfs_server *server;
 	struct dentry *mntroot;
-	struct nfs_fh mntfh;
+	struct nfs_fh *mntfh;
 	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
 	struct nfs_sb_mountdata sb_mntdata = {
 		.mntflags = flags,
 	};
-	int error;
+	int error = -ENOMEM;
 
 	dprintk("--> nfs4_referral_get_sb()\n");
 
+	mntfh = nfs_alloc_fhandle();
+	if (mntfh == NULL)
+		goto out_err_nofh;
+
 	/* create a new volume representation */
-	server = nfs4_create_referral_server(data, &mntfh);
+	server = nfs4_create_referral_server(data, mntfh);
 	if (IS_ERR(server)) {
 		error = PTR_ERR(server);
 		goto out_err_noserver;
@@ -2893,7 +2994,7 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
 	} else {
 		error = nfs_bdi_register(server);
 		if (error)
-			goto error_splat_super;
+			goto error_splat_bdi;
 	}
 
 	if (!s->s_root) {
@@ -2902,7 +3003,7 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
 		nfs_fscache_get_super_cookie(s, NULL, data);
 	}
 
-	mntroot = nfs4_get_root(s, &mntfh);
+	mntroot = nfs4_get_root(s, mntfh);
 	if (IS_ERR(mntroot)) {
 		error = PTR_ERR(mntroot);
 		goto error_splat_super;
@@ -2919,17 +3020,24 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
 
 	security_sb_clone_mnt_opts(data->sb, s);
 
+	nfs_free_fhandle(mntfh);
 	dprintk("<-- nfs4_referral_get_sb() = 0\n");
 	return 0;
 
 out_err_nosb:
 	nfs_free_server(server);
out_err_noserver:
+	nfs_free_fhandle(mntfh);
+out_err_nofh:
 	dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error);
 	return error;
 
 error_splat_super:
+	if (server && !s->s_root)
+		bdi_unregister(&server->backing_dev_info);
+error_splat_bdi:
 	deactivate_locked_super(s);
+	nfs_free_fhandle(mntfh);
 	dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
 	return error;
 }
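Among the super.c changes, nfs_referral_loop_protect() caps how many NFSv4 referral lookups a single task may have in flight (NFS_MAX_NESTED_REFERRALS, i.e. 2), turning a runaway referral chain into -ELOOP instead of unbounded recursion on a heap-allocated nameidata. The single-threaded userspace sketch below shows only the shape of the guard; the kernel keys the count off current and protects the list with a spinlock.

#include <errno.h>
#include <stdio.h>

#define MAX_NESTED 2
static int referral_count;	/* per-task in the kernel, found via a list */

static int loop_protect(void)
{
	if (referral_count >= MAX_NESTED)
		return -ELOOP;
	referral_count++;
	return 0;
}

static void loop_unprotect(void) { referral_count--; }

static int follow_referral(int depth)
{
	int ret = loop_protect();

	if (ret)
		return ret;
	if (depth > 0)
		ret = follow_referral(depth - 1);	/* nested referral */
	loop_unprotect();
	return ret;
}

int main(void)
{
	printf("depth 1: %d\n", follow_referral(1));	/* 0 */
	printf("depth 3: %d\n", follow_referral(3));	/* -ELOOP */
	return 0;
}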
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 2ea9e5c27e55..05c9e02f4153 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -19,7 +19,6 @@
 #include <linux/pagemap.h>
 #include <linux/stat.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/namei.h>
 
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 6da3d3ff6edd..a2242af6a17d 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -23,6 +23,7 @@ struct nfs_unlinkdata {
 	struct nfs_removeres res;
 	struct inode *dir;
 	struct rpc_cred *cred;
+	struct nfs_fattr dir_attr;
 };
 
 /**
@@ -169,7 +170,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
 	}
 	nfs_sb_active(dir->i_sb);
 	data->args.fh = NFS_FH(dir);
-	nfs_fattr_init(&data->res.dir_attr);
+	nfs_fattr_init(data->res.dir_attr);
 
 	NFS_PROTO(dir)->unlink_setup(&msg, dir);
 
@@ -259,6 +260,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
 		goto out_free;
 	}
 	data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
+	data->res.dir_attr = &data->dir_attr;
 
 	status = -EBUSY;
 	spin_lock(&dentry->d_lock);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 53ff70e23993..3aea3ca98ab7 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -201,6 +201,7 @@ static int nfs_set_page_writeback(struct page *page)
 		struct inode *inode = page->mapping->host;
 		struct nfs_server *nfss = NFS_SERVER(inode);
 
+		page_cache_get(page);
 		if (atomic_long_inc_return(&nfss->writeback) >
 				NFS_CONGESTION_ON_THRESH) {
 			set_bdi_congested(&nfss->backing_dev_info,
@@ -216,6 +217,7 @@ static void nfs_end_page_writeback(struct page *page)
 	struct nfs_server *nfss = NFS_SERVER(inode);
 
 	end_page_writeback(page);
+	page_cache_release(page);
 	if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
 		clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
 }
@@ -421,6 +423,7 @@ static void
 nfs_mark_request_dirty(struct nfs_page *req)
 {
 	__set_page_dirty_nobuffers(req->wb_page);
+	__mark_inode_dirty(req->wb_page->mapping->host, I_DIRTY_DATASYNC);
 }
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -660,9 +663,11 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
 	req = nfs_setup_write_request(ctx, page, offset, count);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
+	nfs_mark_request_dirty(req);
 	/* Update file length */
 	nfs_grow_file(page, offset, count);
 	nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
+	nfs_mark_request_dirty(req);
 	nfs_clear_page_tag_locked(req);
 	return 0;
 }
@@ -739,8 +744,6 @@ int nfs_updatepage(struct file *file, struct page *page,
 	status = nfs_writepage_setup(ctx, page, offset, count);
 	if (status < 0)
 		nfs_set_pageerror(page);
-	else
-		__set_page_dirty_nobuffers(page);
 
 	dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n",
 			status, (long long)i_size_read(inode));
@@ -749,13 +752,12 @@ int nfs_updatepage(struct file *file, struct page *page,
 
 static void nfs_writepage_release(struct nfs_page *req)
 {
+	struct page *page = req->wb_page;
 
-	if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) {
-		nfs_end_page_writeback(req->wb_page);
+	if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req))
 		nfs_inode_remove_request(req);
-	} else
-		nfs_end_page_writeback(req->wb_page);
 	nfs_clear_page_tag_locked(req);
+	nfs_end_page_writeback(page);
 }
 
 static int flush_task_priority(int how)
@@ -779,7 +781,6 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 		int how)
 {
 	struct inode *inode = req->wb_context->path.dentry->d_inode;
-	int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
 	int priority = flush_task_priority(how);
 	struct rpc_task *task;
 	struct rpc_message msg = {
@@ -794,9 +795,10 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 		.callback_ops = call_ops,
 		.callback_data = data,
 		.workqueue = nfsiod_workqueue,
-		.flags = flags,
+		.flags = RPC_TASK_ASYNC,
 		.priority = priority,
 	};
+	int ret = 0;
 
 	/* Set up the RPC argument and reply structs
 	 * NB: take care not to mess about with data->commit et al. */
@@ -835,10 +837,18 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 			(unsigned long long)data->args.offset);
 
 	task = rpc_run_task(&task_setup_data);
-	if (IS_ERR(task))
-		return PTR_ERR(task);
+	if (IS_ERR(task)) {
+		ret = PTR_ERR(task);
+		goto out;
+	}
+	if (how & FLUSH_SYNC) {
+		ret = rpc_wait_for_completion_task(task);
+		if (ret == 0)
+			ret = task->tk_status;
+	}
 	rpc_put_task(task);
-	return 0;
+out:
+	return ret;
 }
 
 /* If a nfs_flush_* function fails, it should remove reqs from @head and
@@ -847,9 +857,11 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 */
 static void nfs_redirty_request(struct nfs_page *req)
 {
+	struct page *page = req->wb_page;
+
 	nfs_mark_request_dirty(req);
-	nfs_end_page_writeback(req->wb_page);
 	nfs_clear_page_tag_locked(req);
+	nfs_end_page_writeback(page);
 }
 
 /*
@@ -1084,16 +1096,15 @@ static void nfs_writeback_release_full(void *calldata)
 		if (nfs_write_need_commit(data)) {
 			memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
 			nfs_mark_request_commit(req);
-			nfs_end_page_writeback(page);
 			dprintk(" marked for commit\n");
 			goto next;
 		}
 		dprintk(" OK\n");
remove_request:
-		nfs_end_page_writeback(page);
 		nfs_inode_remove_request(req);
 	next:
 		nfs_clear_page_tag_locked(req);
+		nfs_end_page_writeback(page);
 	}
 	nfs_writedata_release(calldata);
 }
@@ -1190,6 +1201,25 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
+static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
+{
+	if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags))
+		return 1;
+	if (may_wait && !out_of_line_wait_on_bit_lock(&nfsi->flags,
+				NFS_INO_COMMIT, nfs_wait_bit_killable,
+				TASK_KILLABLE))
+		return 1;
+	return 0;
+}
+
+static void nfs_commit_clear_lock(struct nfs_inode *nfsi)
+{
+	clear_bit(NFS_INO_COMMIT, &nfsi->flags);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&nfsi->flags, NFS_INO_COMMIT);
+}
+
+
 static void nfs_commitdata_release(void *data)
 {
 	struct nfs_write_data *wdata = data;
@@ -1207,7 +1237,6 @@ static int nfs_commit_rpcsetup(struct list_head *head,
 {
 	struct nfs_page *first = nfs_list_entry(head->next);
 	struct inode *inode = first->wb_context->path.dentry->d_inode;
-	int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
 	int priority = flush_task_priority(how);
 	struct rpc_task *task;
 	struct rpc_message msg = {
@@ -1222,7 +1251,7 @@ static int nfs_commit_rpcsetup(struct list_head *head,
 		.callback_ops = &nfs_commit_ops,
 		.callback_data = data,
 		.workqueue = nfsiod_workqueue,
-		.flags = flags,
+		.flags = RPC_TASK_ASYNC,
 		.priority = priority,
 	};
 
@@ -1282,6 +1311,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 				BDI_RECLAIMABLE);
 		nfs_clear_page_tag_locked(req);
 	}
+	nfs_commit_clear_lock(NFS_I(inode));
 	return -ENOMEM;
 }
 
@@ -1337,6 +1367,7 @@ static void nfs_commit_release(void *calldata)
 	next:
 		nfs_clear_page_tag_locked(req);
 	}
+	nfs_commit_clear_lock(NFS_I(data->inode));
 	nfs_commitdata_release(calldata);
 }
 
@@ -1351,8 +1382,11 @@ static const struct rpc_call_ops nfs_commit_ops = {
 static int nfs_commit_inode(struct inode *inode, int how)
 {
 	LIST_HEAD(head);
-	int res;
+	int may_wait = how & FLUSH_SYNC;
+	int res = 0;
 
+	if (!nfs_commit_set_lock(NFS_I(inode), may_wait))
+		goto out;
 	spin_lock(&inode->i_lock);
 	res = nfs_scan_commit(inode, &head, 0, 0);
 	spin_unlock(&inode->i_lock);
@@ -1360,7 +1394,13 @@ static int nfs_commit_inode(struct inode *inode, int how)
 		int error = nfs_commit_list(inode, &head, how);
 		if (error < 0)
 			return error;
-	}
+		if (may_wait)
+			wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT,
+					nfs_wait_bit_killable,
+					TASK_KILLABLE);
+	} else
+		nfs_commit_clear_lock(NFS_I(inode));
+out:
 	return res;
 }
 
@@ -1432,6 +1472,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
 
 	BUG_ON(!PageLocked(page));
 	for (;;) {
+		wait_on_page_writeback(page);
 		req = nfs_page_find_request(page);
 		if (req == NULL)
 			break;
@@ -1466,30 +1507,18 @@ int nfs_wb_page(struct inode *inode, struct page *page)
 		.range_start = range_start,
 		.range_end = range_end,
 	};
-	struct nfs_page *req;
-	int need_commit;
 	int ret;
 
 	while(PagePrivate(page)) {
+		wait_on_page_writeback(page);
 		if (clear_page_dirty_for_io(page)) {
 			ret = nfs_writepage_locked(page, &wbc);
 			if (ret < 0)
 				goto out_error;
 		}
-		req = nfs_find_and_lock_request(page);
-		if (!req)
-			break;
-		if (IS_ERR(req)) {
-			ret = PTR_ERR(req);
+		ret = sync_inode(inode, &wbc);
+		if (ret < 0)
 			goto out_error;
-		}
-		need_commit = test_bit(PG_CLEAN, &req->wb_flags);
-		nfs_clear_page_tag_locked(req);
-		if (need_commit) {
-			ret = nfs_commit_inode(inode, FLUSH_SYNC);
-			if (ret < 0)
-				goto out_error;
-		}
 	}
 	return 0;
out_error:
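nfs_commit_set_lock()/nfs_commit_clear_lock() above serialize COMMIT on a single inode flag bit: a non-waiting caller backs off if a commit is already in flight, while a FLUSH_SYNC caller sleeps until the bit clears. The userspace sketch below shows the protocol shape only, using a C11 atomic flag; the kernel's killable, sleeping wait is reduced to a spin for brevity, and the names mirror the patch rather than any userspace API.

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag commit_bit = ATOMIC_FLAG_INIT;

static int commit_set_lock(int may_wait)
{
	if (!atomic_flag_test_and_set(&commit_bit))
		return 1;			/* got the lock */
	if (!may_wait)
		return 0;			/* someone else is committing */
	while (atomic_flag_test_and_set(&commit_bit))
		;				/* kernel: wait_on_bit_lock() */
	return 1;
}

static void commit_clear_lock(void)
{
	/* kernel: clear_bit() + memory barrier + wake_up_bit() */
	atomic_flag_clear(&commit_bit);
}

int main(void)
{
	if (commit_set_lock(0)) {		/* single-threaded demo */
		puts("committing");
		commit_clear_lock();
	}
	return 0;
}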
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index 04133aacb1e5..fc1c52571c03 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -22,6 +22,7 @@
 
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/sunrpc/xdr.h>
 #include <linux/nfsacl.h>
 #include <linux/nfs3.h>
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index a0c4016413f1..c2a4f71d87dd 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -12,6 +12,7 @@
12 * Copyright (C) 1995, 1996 Olaf Kirch, <okir@monad.swb.de> 12 * Copyright (C) 1995, 1996 Olaf Kirch, <okir@monad.swb.de>
13 */ 13 */
14 14
15#include <linux/slab.h>
15#include <linux/namei.h> 16#include <linux/namei.h>
16#include <linux/module.h> 17#include <linux/module.h>
17#include <linux/exportfs.h> 18#include <linux/exportfs.h>
@@ -258,10 +259,9 @@ static struct cache_detail svc_expkey_cache = {
258 .alloc = expkey_alloc, 259 .alloc = expkey_alloc,
259}; 260};
260 261
261static struct svc_expkey * 262static int
262svc_expkey_lookup(struct svc_expkey *item) 263svc_expkey_hash(struct svc_expkey *item)
263{ 264{
264 struct cache_head *ch;
265 int hash = item->ek_fsidtype; 265 int hash = item->ek_fsidtype;
266 char * cp = (char*)item->ek_fsid; 266 char * cp = (char*)item->ek_fsid;
267 int len = key_len(item->ek_fsidtype); 267 int len = key_len(item->ek_fsidtype);
@@ -269,6 +269,14 @@ svc_expkey_lookup(struct svc_expkey *item)
269 hash ^= hash_mem(cp, len, EXPKEY_HASHBITS); 269 hash ^= hash_mem(cp, len, EXPKEY_HASHBITS);
270 hash ^= hash_ptr(item->ek_client, EXPKEY_HASHBITS); 270 hash ^= hash_ptr(item->ek_client, EXPKEY_HASHBITS);
271 hash &= EXPKEY_HASHMASK; 271 hash &= EXPKEY_HASHMASK;
272 return hash;
273}
274
275static struct svc_expkey *
276svc_expkey_lookup(struct svc_expkey *item)
277{
278 struct cache_head *ch;
279 int hash = svc_expkey_hash(item);
272 280
273 ch = sunrpc_cache_lookup(&svc_expkey_cache, &item->h, 281 ch = sunrpc_cache_lookup(&svc_expkey_cache, &item->h,
274 hash); 282 hash);
@@ -282,13 +290,7 @@ static struct svc_expkey *
282svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old) 290svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old)
283{ 291{
284 struct cache_head *ch; 292 struct cache_head *ch;
285 int hash = new->ek_fsidtype; 293 int hash = svc_expkey_hash(new);
286 char * cp = (char*)new->ek_fsid;
287 int len = key_len(new->ek_fsidtype);
288
289 hash ^= hash_mem(cp, len, EXPKEY_HASHBITS);
290 hash ^= hash_ptr(new->ek_client, EXPKEY_HASHBITS);
291 hash &= EXPKEY_HASHMASK;
292 294
293 ch = sunrpc_cache_update(&svc_expkey_cache, &new->h, 295 ch = sunrpc_cache_update(&svc_expkey_cache, &new->h,
294 &old->h, hash); 296 &old->h, hash);
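
Both svc_expkey_lookup() and svc_expkey_update() previously open-coded the same XOR-and-mask hash; the hunks above hoist it into svc_expkey_hash() so the two callers cannot drift apart (the svc_export hunks below apply the same refactor). A sketch of the pattern on a toy key type, with illustrative names:

    #include <stdio.h>

    #define KEY_HASHBITS 8
    #define KEY_HASHMASK ((1 << KEY_HASHBITS) - 1)

    struct key { int type; unsigned long owner; };

    /* One definition of the hash, shared by lookup and update paths. */
    static int key_hash(const struct key *k)
    {
        unsigned int h = (unsigned int)k->type;
        h ^= (unsigned int)(k->owner >> 4);
        return h & KEY_HASHMASK;
    }

    static void lookup(const struct key *k) { printf("lookup bucket %d\n", key_hash(k)); }
    static void update(const struct key *k) { printf("update bucket %d\n", key_hash(k)); }

    int main(void)
    {
        struct key k = { .type = 1, .owner = 0xdeadbeefUL };
        lookup(&k);
        update(&k);  /* guaranteed to land in the same bucket as lookup() */
        return 0;
    }
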
@@ -737,14 +739,22 @@ struct cache_detail svc_export_cache = {
737 .alloc = svc_export_alloc, 739 .alloc = svc_export_alloc,
738}; 740};
739 741
740static struct svc_export * 742static int
741svc_export_lookup(struct svc_export *exp) 743svc_export_hash(struct svc_export *exp)
742{ 744{
743 struct cache_head *ch;
744 int hash; 745 int hash;
746
745 hash = hash_ptr(exp->ex_client, EXPORT_HASHBITS); 747 hash = hash_ptr(exp->ex_client, EXPORT_HASHBITS);
746 hash ^= hash_ptr(exp->ex_path.dentry, EXPORT_HASHBITS); 748 hash ^= hash_ptr(exp->ex_path.dentry, EXPORT_HASHBITS);
747 hash ^= hash_ptr(exp->ex_path.mnt, EXPORT_HASHBITS); 749 hash ^= hash_ptr(exp->ex_path.mnt, EXPORT_HASHBITS);
750 return hash;
751}
752
753static struct svc_export *
754svc_export_lookup(struct svc_export *exp)
755{
756 struct cache_head *ch;
757 int hash = svc_export_hash(exp);
748 758
749 ch = sunrpc_cache_lookup(&svc_export_cache, &exp->h, 759 ch = sunrpc_cache_lookup(&svc_export_cache, &exp->h,
750 hash); 760 hash);
@@ -758,10 +768,7 @@ static struct svc_export *
758svc_export_update(struct svc_export *new, struct svc_export *old) 768svc_export_update(struct svc_export *new, struct svc_export *old)
759{ 769{
760 struct cache_head *ch; 770 struct cache_head *ch;
761 int hash; 771 int hash = svc_export_hash(old);
762 hash = hash_ptr(old->ex_client, EXPORT_HASHBITS);
763 hash ^= hash_ptr(old->ex_path.dentry, EXPORT_HASHBITS);
764 hash ^= hash_ptr(old->ex_path.mnt, EXPORT_HASHBITS);
765 772
766 ch = sunrpc_cache_update(&svc_export_cache, &new->h, 773 ch = sunrpc_cache_update(&svc_export_cache, &new->h,
767 &old->h, 774 &old->h,
@@ -1070,9 +1077,9 @@ exp_export(struct nfsctl_export *nxp)
1070 err = 0; 1077 err = 0;
1071finish: 1078finish:
1072 kfree(new.ex_pathname); 1079 kfree(new.ex_pathname);
1073 if (exp) 1080 if (!IS_ERR_OR_NULL(exp))
1074 exp_put(exp); 1081 exp_put(exp);
1075 if (fsid_key && !IS_ERR(fsid_key)) 1082 if (!IS_ERR_OR_NULL(fsid_key))
1076 cache_put(&fsid_key->h, &svc_expkey_cache); 1083 cache_put(&fsid_key->h, &svc_expkey_cache);
1077 path_put(&path); 1084 path_put(&path);
1078out_put_clp: 1085out_put_clp:
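
The exp_export() cleanup above previously mixed a plain NULL test for exp with a hand-rolled "set and not an error" test for fsid_key; IS_ERR_OR_NULL() covers both outcomes of a lookup that can return NULL, an ERR_PTR-encoded errno, or a valid pointer. A userspace rendering of the err.h encoding (the real macros live in include/linux/err.h):

    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_ERRNO 4095  /* errnos occupy the last page of addresses */

    static inline void *ERR_PTR(long error) { return (void *)error; }
    static inline bool IS_ERR(const void *ptr)
    {
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }
    static inline bool IS_ERR_OR_NULL(const void *ptr)
    {
        return !ptr || IS_ERR(ptr);
    }

    int main(void)
    {
        void *ok = &(int){ 0 }, *none = NULL, *err = ERR_PTR(-12 /* ENOMEM */);
        /* Only the valid pointer should reach an exp_put()/cache_put()
         * style release; prints "1 0 0". */
        printf("%d %d %d\n", !IS_ERR_OR_NULL(ok), !IS_ERR_OR_NULL(none),
               !IS_ERR_OR_NULL(err));
        return 0;
    }
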
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index f20589d2ae27..6aa5590c3679 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -7,6 +7,7 @@
7#include "nfsd.h" 7#include "nfsd.h"
8/* FIXME: nfsacl.h is a broken header */ 8/* FIXME: nfsacl.h is a broken header */
9#include <linux/nfsacl.h> 9#include <linux/nfsacl.h>
10#include <linux/gfp.h>
10#include "cache.h" 11#include "cache.h"
11#include "xdr3.h" 12#include "xdr3.h"
12#include "vfs.h" 13#include "vfs.h"
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index e0c4846bad92..a596e9d987e4 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -7,6 +7,7 @@
7#include "nfsd.h" 7#include "nfsd.h"
8/* FIXME: nfsacl.h is a broken header */ 8/* FIXME: nfsacl.h is a broken header */
9#include <linux/nfsacl.h> 9#include <linux/nfsacl.h>
10#include <linux/gfp.h>
10#include "cache.h" 11#include "cache.h"
11#include "xdr3.h" 12#include "xdr3.h"
12#include "vfs.h" 13#include "vfs.h"
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 88150685df34..e48052615159 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -34,6 +34,7 @@
34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 */ 35 */
36 36
37#include <linux/slab.h>
37#include <linux/nfs_fs.h> 38#include <linux/nfs_fs.h>
38#include <linux/nfs4_acl.h> 39#include <linux/nfs4_acl.h>
39 40
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 4bc22c763de7..eb78e7e22077 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -32,6 +32,8 @@
32 */ 32 */
33 33
34#include <linux/sunrpc/clnt.h> 34#include <linux/sunrpc/clnt.h>
35#include <linux/sunrpc/svc_xprt.h>
36#include <linux/slab.h>
35#include "nfsd.h" 37#include "nfsd.h"
36#include "state.h" 38#include "state.h"
37 39
@@ -78,11 +80,6 @@ enum nfs_cb_opnum4 {
78 cb_sequence_dec_sz + \ 80 cb_sequence_dec_sz + \
79 op_dec_sz) 81 op_dec_sz)
80 82
81struct nfs4_rpc_args {
82 void *args_op;
83 struct nfsd4_cb_sequence args_seq;
84};
85
86/* 83/*
87* Generic encode routines from fs/nfs/nfs4xdr.c 84* Generic encode routines from fs/nfs/nfs4xdr.c
88*/ 85*/
@@ -427,13 +424,19 @@ static struct rpc_procinfo nfs4_cb_procedures[] = {
427}; 424};
428 425
429static struct rpc_version nfs_cb_version4 = { 426static struct rpc_version nfs_cb_version4 = {
427/*
428 * Note on the callback rpc program version number: despite language in rfc
429 * 5661 section 18.36.3 requiring servers to use 4 in this field, the
430 * official xdr descriptions for both 4.0 and 4.1 specify version 1, and
431 * in practice that appears to be what implementations use. The section
432 * 18.36.3 language is expected to be fixed in an erratum.
433 */
430 .number = 1, 434 .number = 1,
431 .nrprocs = ARRAY_SIZE(nfs4_cb_procedures), 435 .nrprocs = ARRAY_SIZE(nfs4_cb_procedures),
432 .procs = nfs4_cb_procedures 436 .procs = nfs4_cb_procedures
433}; 437};
434 438
435static struct rpc_version * nfs_cb_version[] = { 439static struct rpc_version * nfs_cb_version[] = {
436 NULL,
437 &nfs_cb_version4, 440 &nfs_cb_version4,
438}; 441};
439 442
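
With the NULL placeholder gone from nfs_cb_version[], slot 0 now holds nfs_cb_version4, so a later hunk can pass .version = 0 as a plain array index while the on-the-wire version number stays 1, as the new comment explains. A sketch of that indexing invariant (struct trimmed to the one field involved):

    #include <assert.h>
    #include <stdio.h>

    struct rpc_version { unsigned int number; };  /* trimmed stand-in */

    static struct rpc_version nfs_cb_version4 = { .number = 1 };

    /* Before: { NULL, &nfs_cb_version4 } indexed via nfs_cb_version[1]->number.
     * After:  a dense array indexed directly by .version = 0. */
    static struct rpc_version *nfs_cb_version[] = { &nfs_cb_version4 };

    int main(void)
    {
        unsigned int slot = 0;                    /* args.version */
        struct rpc_version *v = nfs_cb_version[slot];
        assert(v->number == 1);                   /* wire version is still 1 */
        printf("slot %u -> rpc version %u\n", slot, v->number);
        return 0;
    }
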
@@ -455,15 +458,14 @@ static struct rpc_program cb_program = {
455 458
456static int max_cb_time(void) 459static int max_cb_time(void)
457{ 460{
458 return max(NFSD_LEASE_TIME/10, (time_t)1) * HZ; 461 return max(nfsd4_lease/10, (time_t)1) * HZ;
459} 462}
460 463
461/* Reference counting, callback cleanup, etc., all look racy as heck. 464/* Reference counting, callback cleanup, etc., all look racy as heck.
462 * And why is cb_set an atomic? */ 465 * And why is cl_cb_set an atomic? */
463 466
464int setup_callback_client(struct nfs4_client *clp) 467int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
465{ 468{
466 struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
467 struct rpc_timeout timeparms = { 469 struct rpc_timeout timeparms = {
468 .to_initval = max_cb_time(), 470 .to_initval = max_cb_time(),
469 .to_retries = 0, 471 .to_retries = 0,
@@ -475,7 +477,7 @@ int setup_callback_client(struct nfs4_client *clp)
475 .timeout = &timeparms, 477 .timeout = &timeparms,
476 .program = &cb_program, 478 .program = &cb_program,
477 .prognumber = cb->cb_prog, 479 .prognumber = cb->cb_prog,
478 .version = nfs_cb_version[1]->number, 480 .version = 0,
479 .authflavor = clp->cl_flavor, 481 .authflavor = clp->cl_flavor,
480 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), 482 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
481 .client_name = clp->cl_principal, 483 .client_name = clp->cl_principal,
@@ -485,7 +487,7 @@ int setup_callback_client(struct nfs4_client *clp)
485 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) 487 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
486 return -EINVAL; 488 return -EINVAL;
487 if (cb->cb_minorversion) { 489 if (cb->cb_minorversion) {
488 args.bc_xprt = clp->cl_cb_xprt; 490 args.bc_xprt = cb->cb_xprt;
489 args.protocol = XPRT_TRANSPORT_BC_TCP; 491 args.protocol = XPRT_TRANSPORT_BC_TCP;
490 } 492 }
491 /* Create RPC client */ 493 /* Create RPC client */
@@ -495,7 +497,7 @@ int setup_callback_client(struct nfs4_client *clp)
495 PTR_ERR(client)); 497 PTR_ERR(client));
496 return PTR_ERR(client); 498 return PTR_ERR(client);
497 } 499 }
498 cb->cb_client = client; 500 nfsd4_set_callback_client(clp, client);
499 return 0; 501 return 0;
500 502
501} 503}
@@ -513,8 +515,7 @@ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
513 if (task->tk_status) 515 if (task->tk_status)
514 warn_no_callback_path(clp, task->tk_status); 516 warn_no_callback_path(clp, task->tk_status);
515 else 517 else
516 atomic_set(&clp->cl_cb_conn.cb_set, 1); 518 atomic_set(&clp->cl_cb_set, 1);
517 put_nfs4_client(clp);
518} 519}
519 520
520static const struct rpc_call_ops nfsd4_cb_probe_ops = { 521static const struct rpc_call_ops nfsd4_cb_probe_ops = {
@@ -536,7 +537,6 @@ int set_callback_cred(void)
536 537
537void do_probe_callback(struct nfs4_client *clp) 538void do_probe_callback(struct nfs4_client *clp)
538{ 539{
539 struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
540 struct rpc_message msg = { 540 struct rpc_message msg = {
541 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], 541 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
542 .rpc_argp = clp, 542 .rpc_argp = clp,
@@ -544,34 +544,27 @@ void do_probe_callback(struct nfs4_client *clp)
544 }; 544 };
545 int status; 545 int status;
546 546
547 status = rpc_call_async(cb->cb_client, &msg, 547 status = rpc_call_async(clp->cl_cb_client, &msg,
548 RPC_TASK_SOFT | RPC_TASK_SOFTCONN, 548 RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
549 &nfsd4_cb_probe_ops, (void *)clp); 549 &nfsd4_cb_probe_ops, (void *)clp);
550 if (status) { 550 if (status)
551 warn_no_callback_path(clp, status); 551 warn_no_callback_path(clp, status);
552 put_nfs4_client(clp);
553 }
554} 552}
555 553
556/* 554/*
557 * Set up the callback client and put a NFSPROC4_CB_NULL on the wire... 555 * Set up the callback client and put a NFSPROC4_CB_NULL on the wire...
558 */ 556 */
559void 557void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
560nfsd4_probe_callback(struct nfs4_client *clp)
561{ 558{
562 int status; 559 int status;
563 560
564 BUG_ON(atomic_read(&clp->cl_cb_conn.cb_set)); 561 BUG_ON(atomic_read(&clp->cl_cb_set));
565 562
566 status = setup_callback_client(clp); 563 status = setup_callback_client(clp, cb);
567 if (status) { 564 if (status) {
568 warn_no_callback_path(clp, status); 565 warn_no_callback_path(clp, status);
569 return; 566 return;
570 } 567 }
571
572 /* the task holds a reference to the nfs4_client struct */
573 atomic_inc(&clp->cl_count);
574
575 do_probe_callback(clp); 568 do_probe_callback(clp);
576} 569}
577 570
@@ -657,18 +650,32 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
657 } 650 }
658} 651}
659 652
653
660static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) 654static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
661{ 655{
662 struct nfs4_delegation *dp = calldata; 656 struct nfs4_delegation *dp = calldata;
663 struct nfs4_client *clp = dp->dl_client; 657 struct nfs4_client *clp = dp->dl_client;
658 struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
664 659
665 nfsd4_cb_done(task, calldata); 660 nfsd4_cb_done(task, calldata);
666 661
662 if (current_rpc_client == NULL) {
663 /* We're shutting down; give up. */
664 /* XXX: err, or is it ok just to fall through
665 * and rpc_restart_call? */
666 return;
667 }
668
667 switch (task->tk_status) { 669 switch (task->tk_status) {
668 case -EIO: 670 case -EIO:
669 /* Network partition? */ 671 /* Network partition? */
670 atomic_set(&clp->cl_cb_conn.cb_set, 0); 672 atomic_set(&clp->cl_cb_set, 0);
671 warn_no_callback_path(clp, task->tk_status); 673 warn_no_callback_path(clp, task->tk_status);
674 if (current_rpc_client != task->tk_client) {
675 /* queue a callback on the new connection: */
676 nfsd4_cb_recall(dp);
677 return;
678 }
672 case -EBADHANDLE: 679 case -EBADHANDLE:
673 case -NFS4ERR_BAD_STATEID: 680 case -NFS4ERR_BAD_STATEID:
674 /* Race: client probably got cb_recall 681 /* Race: client probably got cb_recall
@@ -676,7 +683,7 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
676 break; 683 break;
677 default: 684 default:
678 /* success, or error we can't handle */ 685 /* success, or error we can't handle */
679 goto done; 686 return;
680 } 687 }
681 if (dp->dl_retries--) { 688 if (dp->dl_retries--) {
682 rpc_delay(task, 2*HZ); 689 rpc_delay(task, 2*HZ);
@@ -684,20 +691,16 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
684 rpc_restart_call(task); 691 rpc_restart_call(task);
685 return; 692 return;
686 } else { 693 } else {
687 atomic_set(&clp->cl_cb_conn.cb_set, 0); 694 atomic_set(&clp->cl_cb_set, 0);
688 warn_no_callback_path(clp, task->tk_status); 695 warn_no_callback_path(clp, task->tk_status);
689 } 696 }
690done:
691 kfree(task->tk_msg.rpc_argp);
692} 697}
693 698
694static void nfsd4_cb_recall_release(void *calldata) 699static void nfsd4_cb_recall_release(void *calldata)
695{ 700{
696 struct nfs4_delegation *dp = calldata; 701 struct nfs4_delegation *dp = calldata;
697 struct nfs4_client *clp = dp->dl_client;
698 702
699 nfs4_put_delegation(dp); 703 nfs4_put_delegation(dp);
700 put_nfs4_client(clp);
701} 704}
702 705
703static const struct rpc_call_ops nfsd4_cb_recall_ops = { 706static const struct rpc_call_ops nfsd4_cb_recall_ops = {
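
nfsd4_cb_recall_done() above no longer frees the rpc args (they are now embedded in the delegation), requeues the recall if the callback connection changed underneath it, and otherwise retries transient errors with a delay while dl_retries lasts. The bounded-retry shape, as a standalone sketch (recall_once and the status enum are invented placeholders):

    #include <stdbool.h>
    #include <stdio.h>

    enum cb_status { CB_OK, CB_EIO, CB_BADHANDLE };

    static enum cb_status recall_once(int attempt)
    {
        return attempt < 2 ? CB_EIO : CB_OK;  /* pretend the path heals */
    }

    /* Transient failures (-EIO, -EBADHANDLE, ...) are retried while the
     * budget lasts, mirroring "if (dp->dl_retries--) rpc_restart_call()". */
    static bool recall_with_retries(int retries)
    {
        for (int attempt = 0; ; attempt++) {
            switch (recall_once(attempt)) {
            case CB_OK:
                return true;          /* success, or error we can't handle */
            case CB_EIO:              /* network partition? */
            case CB_BADHANDLE:        /* client probably raced with cb_recall */
                if (retries-- <= 0)
                    return false;     /* warn_no_callback_path() analog */
                /* rpc_delay(task, 2*HZ) would sleep here before restarting */
            }
        }
    }

    int main(void)
    {
        printf("recall %s\n", recall_with_retries(2) ? "succeeded" : "abandoned");
        return 0;
    }
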
@@ -706,33 +709,75 @@ static const struct rpc_call_ops nfsd4_cb_recall_ops = {
706 .rpc_release = nfsd4_cb_recall_release, 709 .rpc_release = nfsd4_cb_recall_release,
707}; 710};
708 711
712static struct workqueue_struct *callback_wq;
713
714int nfsd4_create_callback_queue(void)
715{
716 callback_wq = create_singlethread_workqueue("nfsd4_callbacks");
717 if (!callback_wq)
718 return -ENOMEM;
719 return 0;
720}
721
722void nfsd4_destroy_callback_queue(void)
723{
724 destroy_workqueue(callback_wq);
725}
726
727/* must be called under the state lock */
728void nfsd4_set_callback_client(struct nfs4_client *clp, struct rpc_clnt *new)
729{
730 struct rpc_clnt *old = clp->cl_cb_client;
731
732 clp->cl_cb_client = new;
733 /*
734 * After this, any work that saw the old value of cl_cb_client will
735 * be gone:
736 */
737 flush_workqueue(callback_wq);
738 /* So we can safely shut it down: */
739 if (old)
740 rpc_shutdown_client(old);
741}
742
709/* 743/*
710 * called with dp->dl_count inc'ed. 744 * called with dp->dl_count inc'ed.
711 */ 745 */
712void 746static void _nfsd4_cb_recall(struct nfs4_delegation *dp)
713nfsd4_cb_recall(struct nfs4_delegation *dp)
714{ 747{
715 struct nfs4_client *clp = dp->dl_client; 748 struct nfs4_client *clp = dp->dl_client;
716 struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client; 749 struct rpc_clnt *clnt = clp->cl_cb_client;
717 struct nfs4_rpc_args *args; 750 struct nfs4_rpc_args *args = &dp->dl_recall.cb_args;
718 struct rpc_message msg = { 751 struct rpc_message msg = {
719 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL], 752 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
720 .rpc_cred = callback_cred 753 .rpc_cred = callback_cred
721 }; 754 };
722 int status = -ENOMEM; 755 int status;
756
757 if (clnt == NULL)
758 return; /* Client is shutting down; give up. */
723 759
724 args = kzalloc(sizeof(*args), GFP_KERNEL);
725 if (!args)
726 goto out;
727 args->args_op = dp; 760 args->args_op = dp;
728 msg.rpc_argp = args; 761 msg.rpc_argp = args;
729 dp->dl_retries = 1; 762 dp->dl_retries = 1;
730 status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, 763 status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT,
731 &nfsd4_cb_recall_ops, dp); 764 &nfsd4_cb_recall_ops, dp);
732out: 765 if (status)
733 if (status) {
734 kfree(args);
735 put_nfs4_client(clp);
736 nfs4_put_delegation(dp); 766 nfs4_put_delegation(dp);
737 } 767}
768
769void nfsd4_do_callback_rpc(struct work_struct *w)
770{
771 /* XXX: for now, just send off delegation recall. */
772 /* In future, generalize to handle any sort of callback. */
773 struct nfsd4_callback *c = container_of(w, struct nfsd4_callback, cb_work);
774 struct nfs4_delegation *dp = container_of(c, struct nfs4_delegation, dl_recall);
775
776 _nfsd4_cb_recall(dp);
777}
778
779
780void nfsd4_cb_recall(struct nfs4_delegation *dp)
781{
782 queue_work(callback_wq, &dp->dl_recall.cb_work);
738} 783}
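
The new nfsd4_set_callback_client() above publishes the replacement rpc client, then flushes the callback workqueue so no queued work can still hold the old pointer, and only then shuts the old client down; _nfsd4_cb_recall() bails out when it observes NULL. A single-threaded model of that ordering (queue_work_toy/flush_workqueue_toy are toy reimplementations, not the kernel API):

    #include <stdio.h>
    #include <stdlib.h>

    struct client { int id; };

    /* Toy work queue: flush() runs everything queued so far. */
    typedef void (*work_fn)(void);
    static work_fn pending[16];
    static int npending;

    static void queue_work_toy(work_fn fn) { pending[npending++] = fn; }
    static void flush_workqueue_toy(void)
    {
        for (int i = 0; i < npending; i++)
            pending[i]();
        npending = 0;
    }

    static struct client *cb_client;  /* cl_cb_client analog */

    static void do_recall(void)
    {
        if (cb_client == NULL)
            return;  /* shutting down; give up, as _nfsd4_cb_recall() does */
        printf("recall over client %d\n", cb_client->id);
    }

    static void set_callback_client(struct client *new)
    {
        struct client *old = cb_client;
        cb_client = new;
        flush_workqueue_toy();  /* all work that saw 'old' has now finished */
        free(old);              /* so the old client can be torn down safely */
    }

    int main(void)
    {
        struct client *c = malloc(sizeof(*c));
        c->id = 1;
        set_callback_client(c);
        queue_work_toy(do_recall);
        /* The queued recall is flushed (and bails out on NULL) before
         * 'c' is freed: */
        set_callback_client(NULL);
        return 0;
    }
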
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 6e2983b27f3c..c78dbf493424 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -36,6 +36,7 @@
36#include <linux/nfsd_idmap.h> 36#include <linux/nfsd_idmap.h>
37#include <linux/seq_file.h> 37#include <linux/seq_file.h>
38#include <linux/sched.h> 38#include <linux/sched.h>
39#include <linux/slab.h>
39 40
40/* 41/*
41 * Cache entry 42 * Cache entry
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 37514c469846..59ec449b0c7f 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -33,6 +33,7 @@
33 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 */ 34 */
35#include <linux/file.h> 35#include <linux/file.h>
36#include <linux/slab.h>
36 37
37#include "cache.h" 38#include "cache.h"
38#include "xdr4.h" 39#include "xdr4.h"
@@ -968,20 +969,36 @@ static struct nfsd4_operation nfsd4_ops[];
968static const char *nfsd4_op_name(unsigned opnum); 969static const char *nfsd4_op_name(unsigned opnum);
969 970
970/* 971/*
971 * Enforce NFSv4.1 COMPOUND ordering rules. 972 * Enforce NFSv4.1 COMPOUND ordering rules:
972 * 973 *
973 * TODO: 974 * Also note, enforced elsewhere:
974 * - enforce NFS4ERR_NOT_ONLY_OP, 975 * - SEQUENCE other than as first op results in
975 * - DESTROY_SESSION MUST be the final operation in the COMPOUND request. 976 * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().)
977 * - BIND_CONN_TO_SESSION must be the only op in its compound
978 * (Will be enforced in nfsd4_bind_conn_to_session().)
979 * - DESTROY_SESSION must be the final operation in a compound, if
980 * sessionid's in SEQUENCE and DESTROY_SESSION are the same.
981 * (Enforced in nfsd4_destroy_session().)
976 */ 982 */
977static bool nfs41_op_ordering_ok(struct nfsd4_compoundargs *args) 983static __be32 nfs41_check_op_ordering(struct nfsd4_compoundargs *args)
978{ 984{
979 if (args->minorversion && args->opcnt > 0) { 985 struct nfsd4_op *op = &args->ops[0];
980 struct nfsd4_op *op = &args->ops[0]; 986
981 return (op->status == nfserr_op_illegal) || 987 /* These ordering requirements don't apply to NFSv4.0: */
982 (nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP); 988 if (args->minorversion == 0)
983 } 989 return nfs_ok;
984 return true; 990 /* This is weird, but OK, not our problem: */
991 if (args->opcnt == 0)
992 return nfs_ok;
993 if (op->status == nfserr_op_illegal)
994 return nfs_ok;
995 if (!(nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP))
996 return nfserr_op_not_in_session;
997 if (op->opnum == OP_SEQUENCE)
998 return nfs_ok;
999 if (args->opcnt != 1)
1000 return nfserr_not_only_op;
1001 return nfs_ok;
985} 1002}
986 1003
987/* 1004/*
@@ -1011,6 +1028,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1011 resp->rqstp = rqstp; 1028 resp->rqstp = rqstp;
1012 resp->cstate.minorversion = args->minorversion; 1029 resp->cstate.minorversion = args->minorversion;
1013 resp->cstate.replay_owner = NULL; 1030 resp->cstate.replay_owner = NULL;
1031 resp->cstate.session = NULL;
1014 fh_init(&resp->cstate.current_fh, NFS4_FHSIZE); 1032 fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
1015 fh_init(&resp->cstate.save_fh, NFS4_FHSIZE); 1033 fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
1016 /* Use the deferral mechanism only for NFSv4.0 compounds */ 1034 /* Use the deferral mechanism only for NFSv4.0 compounds */
@@ -1023,13 +1041,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1023 if (args->minorversion > nfsd_supported_minorversion) 1041 if (args->minorversion > nfsd_supported_minorversion)
1024 goto out; 1042 goto out;
1025 1043
1026 if (!nfs41_op_ordering_ok(args)) { 1044 status = nfs41_check_op_ordering(args);
1045 if (status) {
1027 op = &args->ops[0]; 1046 op = &args->ops[0];
1028 op->status = nfserr_sequence_pos; 1047 op->status = status;
1029 goto encode_op; 1048 goto encode_op;
1030 } 1049 }
1031 1050
1032 status = nfs_ok;
1033 while (!status && resp->opcnt < args->opcnt) { 1051 while (!status && resp->opcnt < args->opcnt) {
1034 op = &args->ops[resp->opcnt++]; 1052 op = &args->ops[resp->opcnt++];
1035 1053
@@ -1294,6 +1312,11 @@ static struct nfsd4_operation nfsd4_ops[] = {
1294 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, 1312 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1295 .op_name = "OP_SEQUENCE", 1313 .op_name = "OP_SEQUENCE",
1296 }, 1314 },
1315 [OP_RECLAIM_COMPLETE] = {
1316 .op_func = (nfsd4op_func)nfsd4_reclaim_complete,
1317 .op_flags = ALLOWED_WITHOUT_FH,
1318 .op_name = "OP_RECLAIM_COMPLETE",
1319 },
1297}; 1320};
1298 1321
1299static const char *nfsd4_op_name(unsigned opnum) 1322static const char *nfsd4_op_name(unsigned opnum)
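
The predicate nfs41_op_ordering_ok() becomes nfs41_check_op_ordering() above, returning a distinct NFS error per violation: a first op that is not session-legal yields NFS4ERR_OP_NOT_IN_SESSION, while a session-legal non-SEQUENCE first op (e.g. EXCHANGE_ID) must be alone in the compound or the server answers NFS4ERR_NOT_ONLY_OP. The decision ladder, reduced to a standalone sketch with trimmed-down types:

    #include <stdbool.h>
    #include <stdio.h>

    enum nfserr { NFS_OK, NFSERR_OP_NOT_IN_SESSION, NFSERR_NOT_ONLY_OP };

    struct op { int opnum; bool illegal; bool allowed_as_first_op; };

    /* Mirrors nfs41_check_op_ordering(): only meaningful for minorversion 1. */
    static enum nfserr check_op_ordering(int minorversion, struct op *ops, int opcnt)
    {
        if (minorversion == 0 || opcnt == 0)
            return NFS_OK;                 /* v4.0 / empty: nothing to check */
        if (ops[0].illegal)
            return NFS_OK;                 /* reported as ILLEGAL elsewhere */
        if (!ops[0].allowed_as_first_op)
            return NFSERR_OP_NOT_IN_SESSION;
        if (ops[0].opnum == 53 /* OP_SEQUENCE */)
            return NFS_OK;                 /* normal sessioned compound */
        if (opcnt != 1)
            return NFSERR_NOT_ONLY_OP;     /* e.g. EXCHANGE_ID + anything */
        return NFS_OK;
    }

    int main(void)
    {
        struct op seq = { .opnum = 53, .allowed_as_first_op = true };
        struct op getattr = { .opnum = 9 };
        struct op compound[] = { seq, getattr };
        printf("%d\n", check_op_ordering(1, compound, 2));  /* NFS_OK */
        return 0;
    }
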
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 98fb98e330b4..7a9ae3254a4b 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -32,6 +32,7 @@
32*/ 32*/
33 33
34#include <linux/file.h> 34#include <linux/file.h>
35#include <linux/slab.h>
35#include <linux/namei.h> 36#include <linux/namei.h>
36#include <linux/crypto.h> 37#include <linux/crypto.h>
37#include <linux/sched.h> 38#include <linux/sched.h>
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c97fddbd17db..12f7109720c2 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -34,6 +34,7 @@
34 34
35#include <linux/file.h> 35#include <linux/file.h>
36#include <linux/smp_lock.h> 36#include <linux/smp_lock.h>
37#include <linux/slab.h>
37#include <linux/namei.h> 38#include <linux/namei.h>
38#include <linux/swap.h> 39#include <linux/swap.h>
39#include <linux/sunrpc/svcauth_gss.h> 40#include <linux/sunrpc/svcauth_gss.h>
@@ -44,8 +45,8 @@
44#define NFSDDBG_FACILITY NFSDDBG_PROC 45#define NFSDDBG_FACILITY NFSDDBG_PROC
45 46
46/* Globals */ 47/* Globals */
47static time_t lease_time = 90; /* default lease time */ 48time_t nfsd4_lease = 90; /* default lease time */
48static time_t user_lease_time = 90; 49time_t nfsd4_grace = 90;
49static time_t boot_time; 50static time_t boot_time;
50static u32 current_ownerid = 1; 51static u32 current_ownerid = 1;
51static u32 current_fileid = 1; 52static u32 current_fileid = 1;
@@ -189,7 +190,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
189 dp->dl_vfs_file = stp->st_vfs_file; 190 dp->dl_vfs_file = stp->st_vfs_file;
190 dp->dl_type = type; 191 dp->dl_type = type;
191 dp->dl_ident = cb->cb_ident; 192 dp->dl_ident = cb->cb_ident;
192 dp->dl_stateid.si_boot = get_seconds(); 193 dp->dl_stateid.si_boot = boot_time;
193 dp->dl_stateid.si_stateownerid = current_delegid++; 194 dp->dl_stateid.si_stateownerid = current_delegid++;
194 dp->dl_stateid.si_fileid = 0; 195 dp->dl_stateid.si_fileid = 0;
195 dp->dl_stateid.si_generation = 0; 196 dp->dl_stateid.si_generation = 0;
@@ -198,6 +199,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
198 atomic_set(&dp->dl_count, 1); 199 atomic_set(&dp->dl_count, 1);
199 list_add(&dp->dl_perfile, &fp->fi_delegations); 200 list_add(&dp->dl_perfile, &fp->fi_delegations);
200 list_add(&dp->dl_perclnt, &clp->cl_delegations); 201 list_add(&dp->dl_perclnt, &clp->cl_delegations);
202 INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc);
201 return dp; 203 return dp;
202} 204}
203 205
@@ -248,6 +250,9 @@ unhash_delegation(struct nfs4_delegation *dp)
248 * SETCLIENTID state 250 * SETCLIENTID state
249 */ 251 */
250 252
253/* client_lock protects the client lru list and session hash table */
254static DEFINE_SPINLOCK(client_lock);
255
251/* Hash tables for nfs4_clientid state */ 256/* Hash tables for nfs4_clientid state */
252#define CLIENT_HASH_BITS 4 257#define CLIENT_HASH_BITS 4
253#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) 258#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS)
@@ -366,7 +371,6 @@ static void release_openowner(struct nfs4_stateowner *sop)
366 nfs4_put_stateowner(sop); 371 nfs4_put_stateowner(sop);
367} 372}
368 373
369static DEFINE_SPINLOCK(sessionid_lock);
370#define SESSION_HASH_SIZE 512 374#define SESSION_HASH_SIZE 512
371static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE]; 375static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE];
372 376
@@ -564,10 +568,10 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
564 568
565 new->se_flags = cses->flags; 569 new->se_flags = cses->flags;
566 kref_init(&new->se_ref); 570 kref_init(&new->se_ref);
567 spin_lock(&sessionid_lock); 571 spin_lock(&client_lock);
568 list_add(&new->se_hash, &sessionid_hashtbl[idx]); 572 list_add(&new->se_hash, &sessionid_hashtbl[idx]);
569 list_add(&new->se_perclnt, &clp->cl_sessions); 573 list_add(&new->se_perclnt, &clp->cl_sessions);
570 spin_unlock(&sessionid_lock); 574 spin_unlock(&client_lock);
571 575
572 status = nfs_ok; 576 status = nfs_ok;
573out: 577out:
@@ -578,7 +582,7 @@ out_free:
578 goto out; 582 goto out;
579} 583}
580 584
581/* caller must hold sessionid_lock */ 585/* caller must hold client_lock */
582static struct nfsd4_session * 586static struct nfsd4_session *
583find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid) 587find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
584{ 588{
@@ -601,7 +605,7 @@ find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
601 return NULL; 605 return NULL;
602} 606}
603 607
604/* caller must hold sessionid_lock */ 608/* caller must hold client_lock */
605static void 609static void
606unhash_session(struct nfsd4_session *ses) 610unhash_session(struct nfsd4_session *ses)
607{ 611{
@@ -609,15 +613,6 @@ unhash_session(struct nfsd4_session *ses)
609 list_del(&ses->se_perclnt); 613 list_del(&ses->se_perclnt);
610} 614}
611 615
612static void
613release_session(struct nfsd4_session *ses)
614{
615 spin_lock(&sessionid_lock);
616 unhash_session(ses);
617 spin_unlock(&sessionid_lock);
618 nfsd4_put_session(ses);
619}
620
621void 616void
622free_session(struct kref *kref) 617free_session(struct kref *kref)
623{ 618{
@@ -633,9 +628,18 @@ free_session(struct kref *kref)
633 kfree(ses); 628 kfree(ses);
634} 629}
635 630
631/* must be called under the client_lock */
636static inline void 632static inline void
637renew_client(struct nfs4_client *clp) 633renew_client_locked(struct nfs4_client *clp)
638{ 634{
635 if (is_client_expired(clp)) {
636 dprintk("%s: client (clientid %08x/%08x) already expired\n",
637 __func__,
638 clp->cl_clientid.cl_boot,
639 clp->cl_clientid.cl_id);
640 return;
641 }
642
639 /* 643 /*
640 * Move client to the end to the LRU list. 644 * Move client to the end to the LRU list.
641 */ 645 */
@@ -646,6 +650,14 @@ renew_client(struct nfs4_client *clp)
646 clp->cl_time = get_seconds(); 650 clp->cl_time = get_seconds();
647} 651}
648 652
653static inline void
654renew_client(struct nfs4_client *clp)
655{
656 spin_lock(&client_lock);
657 renew_client_locked(clp);
658 spin_unlock(&client_lock);
659}
660
649/* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */ 661/* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */
650static int 662static int
651STALE_CLIENTID(clientid_t *clid) 663STALE_CLIENTID(clientid_t *clid)
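
renew_client() is split above into renew_client_locked(), which documents that client_lock must be held and refuses to resurrect an already-expired client, plus a thin wrapper that takes the lock. The _locked/wrapper idiom, modeled with a pthread mutex in place of the spinlock:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <time.h>

    static pthread_mutex_t client_lock = PTHREAD_MUTEX_INITIALIZER;

    struct client { bool expired; time_t last_renewed; };

    /* must be called with client_lock held */
    static void renew_client_locked(struct client *clp)
    {
        if (clp->expired)
            return;  /* too late: the laundromat already claimed this client */
        clp->last_renewed = time(NULL);  /* cl_time = get_seconds() analog */
    }

    static void renew_client(struct client *clp)
    {
        pthread_mutex_lock(&client_lock);
        renew_client_locked(clp);
        pthread_mutex_unlock(&client_lock);
    }

    int main(void)
    {
        struct client c = { .expired = false };
        renew_client(&c);
        printf("renewed at %ld\n", (long)c.last_renewed);
        return 0;
    }
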
@@ -679,27 +691,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
679 return clp; 691 return clp;
680} 692}
681 693
682static void
683shutdown_callback_client(struct nfs4_client *clp)
684{
685 struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client;
686
687 if (clnt) {
688 /*
689 * Callback threads take a reference on the client, so there
690 * should be no outstanding callbacks at this point.
691 */
692 clp->cl_cb_conn.cb_client = NULL;
693 rpc_shutdown_client(clnt);
694 }
695}
696
697static inline void 694static inline void
698free_client(struct nfs4_client *clp) 695free_client(struct nfs4_client *clp)
699{ 696{
700 shutdown_callback_client(clp);
701 if (clp->cl_cb_xprt)
702 svc_xprt_put(clp->cl_cb_xprt);
703 if (clp->cl_cred.cr_group_info) 697 if (clp->cl_cred.cr_group_info)
704 put_group_info(clp->cl_cred.cr_group_info); 698 put_group_info(clp->cl_cred.cr_group_info);
705 kfree(clp->cl_principal); 699 kfree(clp->cl_principal);
@@ -708,10 +702,34 @@ free_client(struct nfs4_client *clp)
708} 702}
709 703
710void 704void
711put_nfs4_client(struct nfs4_client *clp) 705release_session_client(struct nfsd4_session *session)
712{ 706{
713 if (atomic_dec_and_test(&clp->cl_count)) 707 struct nfs4_client *clp = session->se_client;
708
709 if (!atomic_dec_and_lock(&clp->cl_refcount, &client_lock))
710 return;
711 if (is_client_expired(clp)) {
714 free_client(clp); 712 free_client(clp);
713 session->se_client = NULL;
714 } else
715 renew_client_locked(clp);
716 spin_unlock(&client_lock);
717 nfsd4_put_session(session);
718}
719
720/* must be called under the client_lock */
721static inline void
722unhash_client_locked(struct nfs4_client *clp)
723{
724 mark_client_expired(clp);
725 list_del(&clp->cl_lru);
726 while (!list_empty(&clp->cl_sessions)) {
727 struct nfsd4_session *ses;
728 ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
729 se_perclnt);
730 unhash_session(ses);
731 nfsd4_put_session(ses);
732 }
715} 733}
716 734
717static void 735static void
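
release_session_client() above relies on atomic_dec_and_lock(): the reference count is dropped, and only when it reaches zero is client_lock acquired, so the expired-vs-renew decision is made atomically with the final put. A userspace model of the primitive (the real one is lib/dec_and_lock.c), using C11 atomics and a mutex:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Returns true with the lock held iff this call dropped the count to zero. */
    static bool atomic_dec_and_lock(atomic_int *cnt, pthread_mutex_t *lock)
    {
        int old = atomic_load(cnt);
        while (old > 1)  /* fast path: not the last reference, skip the lock */
            if (atomic_compare_exchange_weak(cnt, &old, old - 1))
                return false;
        pthread_mutex_lock(lock);
        if (atomic_fetch_sub(cnt, 1) == 1)
            return true;   /* caller must unlock after its final-put work */
        pthread_mutex_unlock(lock);
        return false;
    }

    static pthread_mutex_t client_lock = PTHREAD_MUTEX_INITIALIZER;

    int main(void)
    {
        atomic_int refcount = 1;
        if (atomic_dec_and_lock(&refcount, &client_lock)) {
            printf("last reference: free or renew under the lock\n");
            pthread_mutex_unlock(&client_lock);
        }
        return 0;
    }
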
@@ -721,9 +739,6 @@ expire_client(struct nfs4_client *clp)
721 struct nfs4_delegation *dp; 739 struct nfs4_delegation *dp;
722 struct list_head reaplist; 740 struct list_head reaplist;
723 741
724 dprintk("NFSD: expire_client cl_count %d\n",
725 atomic_read(&clp->cl_count));
726
727 INIT_LIST_HEAD(&reaplist); 742 INIT_LIST_HEAD(&reaplist);
728 spin_lock(&recall_lock); 743 spin_lock(&recall_lock);
729 while (!list_empty(&clp->cl_delegations)) { 744 while (!list_empty(&clp->cl_delegations)) {
@@ -739,20 +754,20 @@ expire_client(struct nfs4_client *clp)
739 list_del_init(&dp->dl_recall_lru); 754 list_del_init(&dp->dl_recall_lru);
740 unhash_delegation(dp); 755 unhash_delegation(dp);
741 } 756 }
742 list_del(&clp->cl_idhash);
743 list_del(&clp->cl_strhash);
744 list_del(&clp->cl_lru);
745 while (!list_empty(&clp->cl_openowners)) { 757 while (!list_empty(&clp->cl_openowners)) {
746 sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); 758 sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
747 release_openowner(sop); 759 release_openowner(sop);
748 } 760 }
749 while (!list_empty(&clp->cl_sessions)) { 761 nfsd4_set_callback_client(clp, NULL);
750 struct nfsd4_session *ses; 762 if (clp->cl_cb_conn.cb_xprt)
751 ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, 763 svc_xprt_put(clp->cl_cb_conn.cb_xprt);
752 se_perclnt); 764 list_del(&clp->cl_idhash);
753 release_session(ses); 765 list_del(&clp->cl_strhash);
754 } 766 spin_lock(&client_lock);
755 put_nfs4_client(clp); 767 unhash_client_locked(clp);
768 if (atomic_read(&clp->cl_refcount) == 0)
769 free_client(clp);
770 spin_unlock(&client_lock);
756} 771}
757 772
758static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) 773static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
@@ -838,14 +853,15 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
838 } 853 }
839 854
840 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); 855 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
841 atomic_set(&clp->cl_count, 1); 856 atomic_set(&clp->cl_refcount, 0);
842 atomic_set(&clp->cl_cb_conn.cb_set, 0); 857 atomic_set(&clp->cl_cb_set, 0);
843 INIT_LIST_HEAD(&clp->cl_idhash); 858 INIT_LIST_HEAD(&clp->cl_idhash);
844 INIT_LIST_HEAD(&clp->cl_strhash); 859 INIT_LIST_HEAD(&clp->cl_strhash);
845 INIT_LIST_HEAD(&clp->cl_openowners); 860 INIT_LIST_HEAD(&clp->cl_openowners);
846 INIT_LIST_HEAD(&clp->cl_delegations); 861 INIT_LIST_HEAD(&clp->cl_delegations);
847 INIT_LIST_HEAD(&clp->cl_sessions); 862 INIT_LIST_HEAD(&clp->cl_sessions);
848 INIT_LIST_HEAD(&clp->cl_lru); 863 INIT_LIST_HEAD(&clp->cl_lru);
864 clp->cl_time = get_seconds();
849 clear_bit(0, &clp->cl_cb_slot_busy); 865 clear_bit(0, &clp->cl_cb_slot_busy);
850 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); 866 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
851 copy_verf(clp, verf); 867 copy_verf(clp, verf);
@@ -876,8 +892,7 @@ add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval)
876 list_add(&clp->cl_strhash, &unconf_str_hashtbl[strhashval]); 892 list_add(&clp->cl_strhash, &unconf_str_hashtbl[strhashval]);
877 idhashval = clientid_hashval(clp->cl_clientid.cl_id); 893 idhashval = clientid_hashval(clp->cl_clientid.cl_id);
878 list_add(&clp->cl_idhash, &unconf_id_hashtbl[idhashval]); 894 list_add(&clp->cl_idhash, &unconf_id_hashtbl[idhashval]);
879 list_add_tail(&clp->cl_lru, &client_lru); 895 renew_client(clp);
880 clp->cl_time = get_seconds();
881} 896}
882 897
883static void 898static void
@@ -887,10 +902,9 @@ move_to_confirmed(struct nfs4_client *clp)
887 unsigned int strhashval; 902 unsigned int strhashval;
888 903
889 dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp); 904 dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp);
890 list_del_init(&clp->cl_strhash);
891 list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]); 905 list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]);
892 strhashval = clientstr_hashval(clp->cl_recdir); 906 strhashval = clientstr_hashval(clp->cl_recdir);
893 list_add(&clp->cl_strhash, &conf_str_hashtbl[strhashval]); 907 list_move(&clp->cl_strhash, &conf_str_hashtbl[strhashval]);
894 renew_client(clp); 908 renew_client(clp);
895} 909}
896 910
@@ -1326,15 +1340,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1326 cs_slot->sl_seqid++; /* from 0 to 1 */ 1340 cs_slot->sl_seqid++; /* from 0 to 1 */
1327 move_to_confirmed(unconf); 1341 move_to_confirmed(unconf);
1328 1342
1329 /*
1330 * We do not support RDMA or persistent sessions
1331 */
1332 cr_ses->flags &= ~SESSION4_PERSIST;
1333 cr_ses->flags &= ~SESSION4_RDMA;
1334
1335 if (cr_ses->flags & SESSION4_BACK_CHAN) { 1343 if (cr_ses->flags & SESSION4_BACK_CHAN) {
1336 unconf->cl_cb_xprt = rqstp->rq_xprt; 1344 unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
1337 svc_xprt_get(unconf->cl_cb_xprt); 1345 svc_xprt_get(rqstp->rq_xprt);
1338 rpc_copy_addr( 1346 rpc_copy_addr(
1339 (struct sockaddr *)&unconf->cl_cb_conn.cb_addr, 1347 (struct sockaddr *)&unconf->cl_cb_conn.cb_addr,
1340 sa); 1348 sa);
@@ -1343,7 +1351,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1343 cstate->minorversion; 1351 cstate->minorversion;
1344 unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog; 1352 unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
1345 unconf->cl_cb_seq_nr = 1; 1353 unconf->cl_cb_seq_nr = 1;
1346 nfsd4_probe_callback(unconf); 1354 nfsd4_probe_callback(unconf, &unconf->cl_cb_conn);
1347 } 1355 }
1348 conf = unconf; 1356 conf = unconf;
1349 } else { 1357 } else {
@@ -1351,6 +1359,12 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1351 goto out; 1359 goto out;
1352 } 1360 }
1353 1361
1362 /*
1363 * We do not support RDMA or persistent sessions
1364 */
1365 cr_ses->flags &= ~SESSION4_PERSIST;
1366 cr_ses->flags &= ~SESSION4_RDMA;
1367
1354 status = alloc_init_session(rqstp, conf, cr_ses); 1368 status = alloc_init_session(rqstp, conf, cr_ses);
1355 if (status) 1369 if (status)
1356 goto out; 1370 goto out;
@@ -1368,6 +1382,21 @@ out:
1368 return status; 1382 return status;
1369} 1383}
1370 1384
1385static bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
1386{
1387 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1388 struct nfsd4_compoundargs *argp = rqstp->rq_argp;
1389
1390 return argp->opcnt == resp->opcnt;
1391}
1392
1393static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
1394{
1395 if (!session)
1396 return 0;
1397 return !memcmp(sid, &session->se_sessionid, sizeof(*sid));
1398}
1399
1371__be32 1400__be32
1372nfsd4_destroy_session(struct svc_rqst *r, 1401nfsd4_destroy_session(struct svc_rqst *r,
1373 struct nfsd4_compound_state *cstate, 1402 struct nfsd4_compound_state *cstate,
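
The two helpers above feed the new DESTROY_SESSION rule enforced just below: if the compound is being executed over the very session it is destroying, DESTROY_SESSION must be the last op, otherwise the server returns NFS4ERR_NOT_ONLY_OP. A sketch of the two predicates, with the structs trimmed to what the checks touch:

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    #define SESSIONID_LEN 16

    struct sessionid { unsigned char data[SESSIONID_LEN]; };
    struct session   { struct sessionid se_sessionid; };

    /* All decoded ops have been executed iff this is the last one. */
    static bool last_compound_op(int argp_opcnt, int resp_opcnt)
    {
        return argp_opcnt == resp_opcnt;
    }

    static bool compound_in_session(struct session *s, struct sessionid *sid)
    {
        if (!s)
            return false;  /* compound not running over a session at all */
        return memcmp(sid, &s->se_sessionid, sizeof(*sid)) == 0;
    }

    int main(void)
    {
        struct session s = { .se_sessionid = { .data = { 1, 2, 3 } } };
        struct sessionid target = s.se_sessionid;
        if (compound_in_session(&s, &target) && !last_compound_op(3, 2))
            printf("NFS4ERR_NOT_ONLY_OP: DESTROY_SESSION must come last\n");
        return 0;
    }
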
@@ -1383,19 +1412,25 @@ nfsd4_destroy_session(struct svc_rqst *r,
1383 * - Do we need to clear any callback info from previous session? 1412 * - Do we need to clear any callback info from previous session?
1384 */ 1413 */
1385 1414
1415 if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) {
1416 if (!nfsd4_last_compound_op(r))
1417 return nfserr_not_only_op;
1418 }
1386 dump_sessionid(__func__, &sessionid->sessionid); 1419 dump_sessionid(__func__, &sessionid->sessionid);
1387 spin_lock(&sessionid_lock); 1420 spin_lock(&client_lock);
1388 ses = find_in_sessionid_hashtbl(&sessionid->sessionid); 1421 ses = find_in_sessionid_hashtbl(&sessionid->sessionid);
1389 if (!ses) { 1422 if (!ses) {
1390 spin_unlock(&sessionid_lock); 1423 spin_unlock(&client_lock);
1391 goto out; 1424 goto out;
1392 } 1425 }
1393 1426
1394 unhash_session(ses); 1427 unhash_session(ses);
1395 spin_unlock(&sessionid_lock); 1428 spin_unlock(&client_lock);
1396 1429
1430 nfs4_lock_state();
1397 /* wait for callbacks */ 1431 /* wait for callbacks */
1398 shutdown_callback_client(ses->se_client); 1432 nfsd4_set_callback_client(ses->se_client, NULL);
1433 nfs4_unlock_state();
1399 nfsd4_put_session(ses); 1434 nfsd4_put_session(ses);
1400 status = nfs_ok; 1435 status = nfs_ok;
1401out: 1436out:
@@ -1416,7 +1451,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
1416 if (resp->opcnt != 1) 1451 if (resp->opcnt != 1)
1417 return nfserr_sequence_pos; 1452 return nfserr_sequence_pos;
1418 1453
1419 spin_lock(&sessionid_lock); 1454 spin_lock(&client_lock);
1420 status = nfserr_badsession; 1455 status = nfserr_badsession;
1421 session = find_in_sessionid_hashtbl(&seq->sessionid); 1456 session = find_in_sessionid_hashtbl(&seq->sessionid);
1422 if (!session) 1457 if (!session)
@@ -1455,23 +1490,47 @@ nfsd4_sequence(struct svc_rqst *rqstp,
1455 cstate->slot = slot; 1490 cstate->slot = slot;
1456 cstate->session = session; 1491 cstate->session = session;
1457 1492
1458 /* Hold a session reference until done processing the compound:
1459 * nfsd4_put_session called only if the cstate slot is set.
1460 */
1461 nfsd4_get_session(session);
1462out: 1493out:
1463 spin_unlock(&sessionid_lock); 1494 /* Hold a session reference until done processing the compound. */
1464 /* Renew the clientid on success and on replay */
1465 if (cstate->session) { 1495 if (cstate->session) {
1466 nfs4_lock_state(); 1496 nfsd4_get_session(cstate->session);
1467 renew_client(session->se_client); 1497 atomic_inc(&session->se_client->cl_refcount);
1468 nfs4_unlock_state();
1469 } 1498 }
1499 spin_unlock(&client_lock);
1470 dprintk("%s: return %d\n", __func__, ntohl(status)); 1500 dprintk("%s: return %d\n", __func__, ntohl(status));
1471 return status; 1501 return status;
1472} 1502}
1473 1503
1474__be32 1504__be32
1505nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc)
1506{
1507 if (rc->rca_one_fs) {
1508 if (!cstate->current_fh.fh_dentry)
1509 return nfserr_nofilehandle;
1510 /*
1511 * We don't take advantage of the rca_one_fs case.
1512 * That's OK, it's optional, we can safely ignore it.
1513 */
1514 return nfs_ok;
1515 }
1516 nfs4_lock_state();
1517 if (is_client_expired(cstate->session->se_client)) {
1518 nfs4_unlock_state();
1519 /*
1520 * The following error isn't really legal.
1521 * But we only get here if the client just explicitly
1522 * destroyed the client. Surely it no longer cares what
1523 * error it gets back on an operation for the dead
1524 * client.
1525 */
1526 return nfserr_stale_clientid;
1527 }
1528 nfsd4_create_clid_dir(cstate->session->se_client);
1529 nfs4_unlock_state();
1530 return nfs_ok;
1531}
1532
1533__be32
1475nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 1534nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1476 struct nfsd4_setclientid *setclid) 1535 struct nfsd4_setclientid *setclid)
1477{ 1536{
@@ -1630,9 +1689,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
1630 if (!same_creds(&conf->cl_cred, &unconf->cl_cred)) 1689 if (!same_creds(&conf->cl_cred, &unconf->cl_cred))
1631 status = nfserr_clid_inuse; 1690 status = nfserr_clid_inuse;
1632 else { 1691 else {
1633 /* XXX: We just turn off callbacks until we can handle 1692 atomic_set(&conf->cl_cb_set, 0);
1634 * change request correctly. */ 1693 nfsd4_probe_callback(conf, &unconf->cl_cb_conn);
1635 atomic_set(&conf->cl_cb_conn.cb_set, 0);
1636 expire_client(unconf); 1694 expire_client(unconf);
1637 status = nfs_ok; 1695 status = nfs_ok;
1638 1696
@@ -1666,7 +1724,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
1666 } 1724 }
1667 move_to_confirmed(unconf); 1725 move_to_confirmed(unconf);
1668 conf = unconf; 1726 conf = unconf;
1669 nfsd4_probe_callback(conf); 1727 nfsd4_probe_callback(conf, &conf->cl_cb_conn);
1670 status = nfs_ok; 1728 status = nfs_ok;
1671 } 1729 }
1672 } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm))) 1730 } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm)))
@@ -1699,12 +1757,12 @@ alloc_init_file(struct inode *ino)
1699 INIT_LIST_HEAD(&fp->fi_hash); 1757 INIT_LIST_HEAD(&fp->fi_hash);
1700 INIT_LIST_HEAD(&fp->fi_stateids); 1758 INIT_LIST_HEAD(&fp->fi_stateids);
1701 INIT_LIST_HEAD(&fp->fi_delegations); 1759 INIT_LIST_HEAD(&fp->fi_delegations);
1702 spin_lock(&recall_lock);
1703 list_add(&fp->fi_hash, &file_hashtbl[hashval]);
1704 spin_unlock(&recall_lock);
1705 fp->fi_inode = igrab(ino); 1760 fp->fi_inode = igrab(ino);
1706 fp->fi_id = current_fileid++; 1761 fp->fi_id = current_fileid++;
1707 fp->fi_had_conflict = false; 1762 fp->fi_had_conflict = false;
1763 spin_lock(&recall_lock);
1764 list_add(&fp->fi_hash, &file_hashtbl[hashval]);
1765 spin_unlock(&recall_lock);
1708 return fp; 1766 return fp;
1709 } 1767 }
1710 return NULL; 1768 return NULL;
@@ -1826,7 +1884,7 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *
1826 stp->st_stateowner = sop; 1884 stp->st_stateowner = sop;
1827 get_nfs4_file(fp); 1885 get_nfs4_file(fp);
1828 stp->st_file = fp; 1886 stp->st_file = fp;
1829 stp->st_stateid.si_boot = get_seconds(); 1887 stp->st_stateid.si_boot = boot_time;
1830 stp->st_stateid.si_stateownerid = sop->so_id; 1888 stp->st_stateid.si_stateownerid = sop->so_id;
1831 stp->st_stateid.si_fileid = fp->fi_id; 1889 stp->st_stateid.si_fileid = fp->fi_id;
1832 stp->st_stateid.si_generation = 0; 1890 stp->st_stateid.si_generation = 0;
@@ -2027,7 +2085,6 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
2027 * lock) we know the server hasn't removed the lease yet, we know 2085 * lock) we know the server hasn't removed the lease yet, we know
2028 * it's safe to take a reference: */ 2086 * it's safe to take a reference: */
2029 atomic_inc(&dp->dl_count); 2087 atomic_inc(&dp->dl_count);
2030 atomic_inc(&dp->dl_client->cl_count);
2031 2088
2032 spin_lock(&recall_lock); 2089 spin_lock(&recall_lock);
2033 list_add_tail(&dp->dl_recall_lru, &del_recall_lru); 2090 list_add_tail(&dp->dl_recall_lru, &del_recall_lru);
@@ -2346,7 +2403,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2346{ 2403{
2347 struct nfs4_delegation *dp; 2404 struct nfs4_delegation *dp;
2348 struct nfs4_stateowner *sop = stp->st_stateowner; 2405 struct nfs4_stateowner *sop = stp->st_stateowner;
2349 struct nfs4_cb_conn *cb = &sop->so_client->cl_cb_conn; 2406 int cb_up = atomic_read(&sop->so_client->cl_cb_set);
2350 struct file_lock fl, *flp = &fl; 2407 struct file_lock fl, *flp = &fl;
2351 int status, flag = 0; 2408 int status, flag = 0;
2352 2409
@@ -2354,7 +2411,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2354 open->op_recall = 0; 2411 open->op_recall = 0;
2355 switch (open->op_claim_type) { 2412 switch (open->op_claim_type) {
2356 case NFS4_OPEN_CLAIM_PREVIOUS: 2413 case NFS4_OPEN_CLAIM_PREVIOUS:
2357 if (!atomic_read(&cb->cb_set)) 2414 if (!cb_up)
2358 open->op_recall = 1; 2415 open->op_recall = 1;
2359 flag = open->op_delegate_type; 2416 flag = open->op_delegate_type;
2360 if (flag == NFS4_OPEN_DELEGATE_NONE) 2417 if (flag == NFS4_OPEN_DELEGATE_NONE)
@@ -2365,7 +2422,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2365 * had the chance to reclaim theirs.... */ 2422 * had the chance to reclaim theirs.... */
2366 if (locks_in_grace()) 2423 if (locks_in_grace())
2367 goto out; 2424 goto out;
2368 if (!atomic_read(&cb->cb_set) || !sop->so_confirmed) 2425 if (!cb_up || !sop->so_confirmed)
2369 goto out; 2426 goto out;
2370 if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) 2427 if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
2371 flag = NFS4_OPEN_DELEGATE_WRITE; 2428 flag = NFS4_OPEN_DELEGATE_WRITE;
@@ -2482,10 +2539,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
2482 } 2539 }
2483 memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); 2540 memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t));
2484 2541
2485 if (nfsd4_has_session(&resp->cstate)) { 2542 if (nfsd4_has_session(&resp->cstate))
2486 open->op_stateowner->so_confirmed = 1; 2543 open->op_stateowner->so_confirmed = 1;
2487 nfsd4_create_clid_dir(open->op_stateowner->so_client);
2488 }
2489 2544
2490 /* 2545 /*
2491 * Attempt to hand out a delegation. No error return, because the 2546 * Attempt to hand out a delegation. No error return, because the
@@ -2536,7 +2591,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2536 renew_client(clp); 2591 renew_client(clp);
2537 status = nfserr_cb_path_down; 2592 status = nfserr_cb_path_down;
2538 if (!list_empty(&clp->cl_delegations) 2593 if (!list_empty(&clp->cl_delegations)
2539 && !atomic_read(&clp->cl_cb_conn.cb_set)) 2594 && !atomic_read(&clp->cl_cb_set))
2540 goto out; 2595 goto out;
2541 status = nfs_ok; 2596 status = nfs_ok;
2542out: 2597out:
@@ -2553,6 +2608,12 @@ nfsd4_end_grace(void)
2553 dprintk("NFSD: end of grace period\n"); 2608 dprintk("NFSD: end of grace period\n");
2554 nfsd4_recdir_purge_old(); 2609 nfsd4_recdir_purge_old();
2555 locks_end_grace(&nfsd4_manager); 2610 locks_end_grace(&nfsd4_manager);
2611 /*
2612 * Now that every NFSv4 client has had the chance to recover and
2613 * to see the (possibly new, possibly shorter) lease time, we
2614 * can safely set the next grace time to the current lease time:
2615 */
2616 nfsd4_grace = nfsd4_lease;
2556} 2617}
2557 2618
2558static time_t 2619static time_t
@@ -2562,15 +2623,17 @@ nfs4_laundromat(void)
2562 struct nfs4_stateowner *sop; 2623 struct nfs4_stateowner *sop;
2563 struct nfs4_delegation *dp; 2624 struct nfs4_delegation *dp;
2564 struct list_head *pos, *next, reaplist; 2625 struct list_head *pos, *next, reaplist;
2565 time_t cutoff = get_seconds() - NFSD_LEASE_TIME; 2626 time_t cutoff = get_seconds() - nfsd4_lease;
2566 time_t t, clientid_val = NFSD_LEASE_TIME; 2627 time_t t, clientid_val = nfsd4_lease;
2567 time_t u, test_val = NFSD_LEASE_TIME; 2628 time_t u, test_val = nfsd4_lease;
2568 2629
2569 nfs4_lock_state(); 2630 nfs4_lock_state();
2570 2631
2571 dprintk("NFSD: laundromat service - starting\n"); 2632 dprintk("NFSD: laundromat service - starting\n");
2572 if (locks_in_grace()) 2633 if (locks_in_grace())
2573 nfsd4_end_grace(); 2634 nfsd4_end_grace();
2635 INIT_LIST_HEAD(&reaplist);
2636 spin_lock(&client_lock);
2574 list_for_each_safe(pos, next, &client_lru) { 2637 list_for_each_safe(pos, next, &client_lru) {
2575 clp = list_entry(pos, struct nfs4_client, cl_lru); 2638 clp = list_entry(pos, struct nfs4_client, cl_lru);
2576 if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) { 2639 if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) {
@@ -2579,12 +2642,22 @@ nfs4_laundromat(void)
2579 clientid_val = t; 2642 clientid_val = t;
2580 break; 2643 break;
2581 } 2644 }
2645 if (atomic_read(&clp->cl_refcount)) {
2646 dprintk("NFSD: client in use (clientid %08x)\n",
2647 clp->cl_clientid.cl_id);
2648 continue;
2649 }
2650 unhash_client_locked(clp);
2651 list_add(&clp->cl_lru, &reaplist);
2652 }
2653 spin_unlock(&client_lock);
2654 list_for_each_safe(pos, next, &reaplist) {
2655 clp = list_entry(pos, struct nfs4_client, cl_lru);
2582 dprintk("NFSD: purging unused client (clientid %08x)\n", 2656 dprintk("NFSD: purging unused client (clientid %08x)\n",
2583 clp->cl_clientid.cl_id); 2657 clp->cl_clientid.cl_id);
2584 nfsd4_remove_clid_dir(clp); 2658 nfsd4_remove_clid_dir(clp);
2585 expire_client(clp); 2659 expire_client(clp);
2586 } 2660 }
2587 INIT_LIST_HEAD(&reaplist);
2588 spin_lock(&recall_lock); 2661 spin_lock(&recall_lock);
2589 list_for_each_safe(pos, next, &del_recall_lru) { 2662 list_for_each_safe(pos, next, &del_recall_lru) {
2590 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); 2663 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
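
The laundromat hunk above now walks client_lru under client_lock, skips clients still pinned by in-flight compounds (cl_refcount), moves the expirable ones onto a private reaplist, and only after dropping the lock runs the heavyweight expire_client() teardown. A sketch of that collect-then-reap pattern over a simple singly linked list:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t client_lock = PTHREAD_MUTEX_INITIALIZER;

    struct client { int id; int refcount; long time; struct client *next; };

    static struct client *lru, *reaplist;

    static void laundromat(long cutoff)
    {
        struct client **pp, *clp;

        pthread_mutex_lock(&client_lock);
        for (pp = &lru; (clp = *pp) != NULL; ) {
            if (clp->time > cutoff || clp->refcount > 0) {
                pp = &clp->next;     /* recently renewed or in use: keep */
                continue;
            }
            *pp = clp->next;         /* unhash_client_locked() analog */
            clp->next = reaplist;    /* park on the private reap list */
            reaplist = clp;
        }
        pthread_mutex_unlock(&client_lock);

        /* Expensive teardown happens outside the lock, one client at a time. */
        while ((clp = reaplist) != NULL) {
            reaplist = clp->next;
            printf("purging unused client %d\n", clp->id);
            /* expire_client(clp) analog would free state here */
        }
    }

    int main(void)
    {
        static struct client a = { .id = 1, .refcount = 0, .time = 10 };
        lru = &a;
        laundromat(100);
        return 0;
    }
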
@@ -2604,7 +2677,7 @@ nfs4_laundromat(void)
2604 list_del_init(&dp->dl_recall_lru); 2677 list_del_init(&dp->dl_recall_lru);
2605 unhash_delegation(dp); 2678 unhash_delegation(dp);
2606 } 2679 }
2607 test_val = NFSD_LEASE_TIME; 2680 test_val = nfsd4_lease;
2608 list_for_each_safe(pos, next, &close_lru) { 2681 list_for_each_safe(pos, next, &close_lru) {
2609 sop = list_entry(pos, struct nfs4_stateowner, so_close_lru); 2682 sop = list_entry(pos, struct nfs4_stateowner, so_close_lru);
2610 if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) { 2683 if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) {
@@ -2660,39 +2733,11 @@ nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp)
2660static int 2733static int
2661STALE_STATEID(stateid_t *stateid) 2734STALE_STATEID(stateid_t *stateid)
2662{ 2735{
2663 if (time_after((unsigned long)boot_time, 2736 if (stateid->si_boot == boot_time)
2664 (unsigned long)stateid->si_boot)) { 2737 return 0;
2665 dprintk("NFSD: stale stateid " STATEID_FMT "!\n", 2738 dprintk("NFSD: stale stateid " STATEID_FMT "!\n",
2666 STATEID_VAL(stateid));
2667 return 1;
2668 }
2669 return 0;
2670}
2671
2672static int
2673EXPIRED_STATEID(stateid_t *stateid)
2674{
2675 if (time_before((unsigned long)boot_time,
2676 ((unsigned long)stateid->si_boot)) &&
2677 time_before((unsigned long)(stateid->si_boot + lease_time), get_seconds())) {
2678 dprintk("NFSD: expired stateid " STATEID_FMT "!\n",
2679 STATEID_VAL(stateid));
2680 return 1;
2681 }
2682 return 0;
2683}
2684
2685static __be32
2686stateid_error_map(stateid_t *stateid)
2687{
2688 if (STALE_STATEID(stateid))
2689 return nfserr_stale_stateid;
2690 if (EXPIRED_STATEID(stateid))
2691 return nfserr_expired;
2692
2693 dprintk("NFSD: bad stateid " STATEID_FMT "!\n",
2694 STATEID_VAL(stateid)); 2739 STATEID_VAL(stateid));
2695 return nfserr_bad_stateid; 2740 return 1;
2696} 2741}
2697 2742
2698static inline int 2743static inline int
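
Because init_stateid() and alloc_init_deleg() now stamp si_boot with boot_time itself rather than get_seconds(), staleness collapses above to an equality test, and the separate EXPIRED_STATEID()/stateid_error_map() machinery goes away. The check, in miniature:

    #include <stdbool.h>
    #include <stdio.h>
    #include <time.h>

    static time_t boot_time;  /* fixed once at server start */

    struct stateid { time_t si_boot; unsigned int si_id; };

    static struct stateid make_stateid(unsigned int id)
    {
        /* si_boot = boot_time, not get_seconds(): equality is then exact */
        return (struct stateid){ .si_boot = boot_time, .si_id = id };
    }

    static bool stale_stateid(const struct stateid *s)
    {
        return s->si_boot != boot_time;  /* minted by an earlier server instance */
    }

    int main(void)
    {
        boot_time = time(NULL);
        struct stateid s = make_stateid(7);
        printf("stale? %d\n", stale_stateid(&s));  /* 0 */
        boot_time += 1;                            /* simulate a server reboot */
        printf("stale? %d\n", stale_stateid(&s));  /* 1 */
        return 0;
    }
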
@@ -2816,10 +2861,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
2816 status = nfserr_bad_stateid; 2861 status = nfserr_bad_stateid;
2817 if (is_delegation_stateid(stateid)) { 2862 if (is_delegation_stateid(stateid)) {
2818 dp = find_delegation_stateid(ino, stateid); 2863 dp = find_delegation_stateid(ino, stateid);
2819 if (!dp) { 2864 if (!dp)
2820 status = stateid_error_map(stateid);
2821 goto out; 2865 goto out;
2822 }
2823 status = check_stateid_generation(stateid, &dp->dl_stateid, 2866 status = check_stateid_generation(stateid, &dp->dl_stateid,
2824 flags); 2867 flags);
2825 if (status) 2868 if (status)
@@ -2832,10 +2875,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
2832 *filpp = dp->dl_vfs_file; 2875 *filpp = dp->dl_vfs_file;
2833 } else { /* open or lock stateid */ 2876 } else { /* open or lock stateid */
2834 stp = find_stateid(stateid, flags); 2877 stp = find_stateid(stateid, flags);
2835 if (!stp) { 2878 if (!stp)
2836 status = stateid_error_map(stateid);
2837 goto out; 2879 goto out;
2838 }
2839 if (nfs4_check_fh(current_fh, stp)) 2880 if (nfs4_check_fh(current_fh, stp))
2840 goto out; 2881 goto out;
2841 if (!stp->st_stateowner->so_confirmed) 2882 if (!stp->st_stateowner->so_confirmed)
@@ -2907,7 +2948,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
2907 */ 2948 */
2908 sop = search_close_lru(stateid->si_stateownerid, flags); 2949 sop = search_close_lru(stateid->si_stateownerid, flags);
2909 if (sop == NULL) 2950 if (sop == NULL)
2910 return stateid_error_map(stateid); 2951 return nfserr_bad_stateid;
2911 *sopp = sop; 2952 *sopp = sop;
2912 goto check_replay; 2953 goto check_replay;
2913 } 2954 }
@@ -3174,10 +3215,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3174 if (!is_delegation_stateid(stateid)) 3215 if (!is_delegation_stateid(stateid))
3175 goto out; 3216 goto out;
3176 dp = find_delegation_stateid(inode, stateid); 3217 dp = find_delegation_stateid(inode, stateid);
3177 if (!dp) { 3218 if (!dp)
3178 status = stateid_error_map(stateid);
3179 goto out; 3219 goto out;
3180 }
3181 status = check_stateid_generation(stateid, &dp->dl_stateid, flags); 3220 status = check_stateid_generation(stateid, &dp->dl_stateid, flags);
3182 if (status) 3221 if (status)
3183 goto out; 3222 goto out;
@@ -3403,7 +3442,7 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc
3403 stp->st_stateowner = sop; 3442 stp->st_stateowner = sop;
3404 get_nfs4_file(fp); 3443 get_nfs4_file(fp);
3405 stp->st_file = fp; 3444 stp->st_file = fp;
3406 stp->st_stateid.si_boot = get_seconds(); 3445 stp->st_stateid.si_boot = boot_time;
3407 stp->st_stateid.si_stateownerid = sop->so_id; 3446 stp->st_stateid.si_stateownerid = sop->so_id;
3408 stp->st_stateid.si_fileid = fp->fi_id; 3447 stp->st_stateid.si_fileid = fp->fi_id;
3409 stp->st_stateid.si_generation = 0; 3448 stp->st_stateid.si_generation = 0;
@@ -3975,12 +4014,6 @@ nfsd4_load_reboot_recovery_data(void)
3975 printk("NFSD: Failure reading reboot recovery data\n"); 4014 printk("NFSD: Failure reading reboot recovery data\n");
3976} 4015}
3977 4016
3978unsigned long
3979get_nfs4_grace_period(void)
3980{
3981 return max(user_lease_time, lease_time) * HZ;
3982}
3983
3984/* 4017/*
3985 * Since the lifetime of a delegation isn't limited to that of an open, a 4018 * Since the lifetime of a delegation isn't limited to that of an open, a
3986 * client may quite reasonably hang on to a delegation as long as it has 4019 * client may quite reasonably hang on to a delegation as long as it has
@@ -4007,20 +4040,27 @@ set_max_delegations(void)
4007static int 4040static int
4008__nfs4_state_start(void) 4041__nfs4_state_start(void)
4009{ 4042{
4010 unsigned long grace_time; 4043 int ret;
4011 4044
4012 boot_time = get_seconds(); 4045 boot_time = get_seconds();
4013 grace_time = get_nfs4_grace_period();
4014 lease_time = user_lease_time;
4015 locks_start_grace(&nfsd4_manager); 4046 locks_start_grace(&nfsd4_manager);
4016 printk(KERN_INFO "NFSD: starting %ld-second grace period\n", 4047 printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
4017 grace_time/HZ); 4048 nfsd4_grace);
4049 ret = set_callback_cred();
4050 if (ret)
4051 return -ENOMEM;
4018 laundry_wq = create_singlethread_workqueue("nfsd4"); 4052 laundry_wq = create_singlethread_workqueue("nfsd4");
4019 if (laundry_wq == NULL) 4053 if (laundry_wq == NULL)
4020 return -ENOMEM; 4054 return -ENOMEM;
4021 queue_delayed_work(laundry_wq, &laundromat_work, grace_time); 4055 ret = nfsd4_create_callback_queue();
4056 if (ret)
4057 goto out_free_laundry;
4058 queue_delayed_work(laundry_wq, &laundromat_work, nfsd4_grace * HZ);
4022 set_max_delegations(); 4059 set_max_delegations();
4023 return set_callback_cred(); 4060 return 0;
4061out_free_laundry:
4062 destroy_workqueue(laundry_wq);
4063 return ret;
4024} 4064}
4025 4065
4026int 4066int
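
__nfs4_state_start() above gains a second allocation (the callback workqueue), so it adopts the usual goto-based unwind: each failure path releases exactly what earlier steps set up. The idiom in a standalone sketch (create_workqueue_toy and friends are invented stand-ins):

    #include <stdio.h>
    #include <stdlib.h>

    static void *create_workqueue_toy(const char *name) { (void)name; return malloc(1); }
    static void destroy_workqueue_toy(void *wq) { free(wq); }

    static int state_start(void)
    {
        void *laundry_wq, *callback_wq;
        int ret;

        laundry_wq = create_workqueue_toy("nfsd4");
        if (!laundry_wq)
            return -12;             /* -ENOMEM */
        callback_wq = create_workqueue_toy("nfsd4_callbacks");
        if (!callback_wq) {
            ret = -12;
            goto out_free_laundry;  /* undo only what already succeeded */
        }
        destroy_workqueue_toy(callback_wq);  /* normal teardown lives elsewhere */
        destroy_workqueue_toy(laundry_wq);
        return 0;

    out_free_laundry:
        destroy_workqueue_toy(laundry_wq);
        return ret;
    }

    int main(void)
    {
        printf("state_start -> %d\n", state_start());
        return 0;
    }
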
@@ -4038,12 +4078,6 @@ nfs4_state_start(void)
4038 return 0; 4078 return 0;
4039} 4079}
4040 4080
4041time_t
4042nfs4_lease_time(void)
4043{
4044 return lease_time;
4045}
4046
4047static void 4081static void
4048__nfs4_state_shutdown(void) 4082__nfs4_state_shutdown(void)
4049{ 4083{
@@ -4088,6 +4122,7 @@ nfs4_state_shutdown(void)
4088 nfs4_lock_state(); 4122 nfs4_lock_state();
4089 nfs4_release_reclaim(); 4123 nfs4_release_reclaim();
4090 __nfs4_state_shutdown(); 4124 __nfs4_state_shutdown();
4125 nfsd4_destroy_callback_queue();
4091 nfs4_unlock_state(); 4126 nfs4_unlock_state();
4092} 4127}
4093 4128
@@ -4127,21 +4162,3 @@ nfs4_recoverydir(void)
4127{ 4162{
4128 return user_recovery_dirname; 4163 return user_recovery_dirname;
4129} 4164}
4130
4131/*
4132 * Called when leasetime is changed.
4133 *
4134 * The only way the protocol gives us to handle on-the-fly lease changes is to
4135 * simulate a reboot. Instead of doing that, we just wait till the next time
4136 * we start to register any changes in lease time. If the administrator
4137 * really wants to change the lease time *now*, they can go ahead and bring
4138 * nfsd down and then back up again after changing the lease time.
4139 *
4140 * user_lease_time is protected by nfsd_mutex since it's only really accessed
4141 * when nfsd is starting
4142 */
4143void
4144nfs4_reset_lease(time_t leasetime)
4145{
4146 user_lease_time = leasetime;
4147}
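
With the getter and setter pair removed, the lease and grace periods are plain time_t globals that the nfsctl.c code further down writes directly while holding nfsd_mutex. Their definitions are not part of this hunk; presumably something like the following near the top of nfs4state.c, with the traditional 90-second default (the initial value here is an assumption):

    time_t nfsd4_lease = 90;    /* seconds */
    time_t nfsd4_grace = 90;    /* seconds */
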
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 78c7e24e5129..ac17a7080239 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -40,6 +40,7 @@
40 * at the end of nfs4svc_decode_compoundargs. 40 * at the end of nfs4svc_decode_compoundargs.
41 */ 41 */
42 42
43#include <linux/slab.h>
43#include <linux/namei.h> 44#include <linux/namei.h>
44#include <linux/statfs.h> 45#include <linux/statfs.h>
45#include <linux/utsname.h> 46#include <linux/utsname.h>
@@ -160,10 +161,10 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
160 argp->p = page_address(argp->pagelist[0]); 161 argp->p = page_address(argp->pagelist[0]);
161 argp->pagelist++; 162 argp->pagelist++;
162 if (argp->pagelen < PAGE_SIZE) { 163 if (argp->pagelen < PAGE_SIZE) {
163 argp->end = p + (argp->pagelen>>2); 164 argp->end = argp->p + (argp->pagelen>>2);
164 argp->pagelen = 0; 165 argp->pagelen = 0;
165 } else { 166 } else {
166 argp->end = p + (PAGE_SIZE>>2); 167 argp->end = argp->p + (PAGE_SIZE>>2);
167 argp->pagelen -= PAGE_SIZE; 168 argp->pagelen -= PAGE_SIZE;
168 } 169 }
169 memcpy(((char*)p)+avail, argp->p, (nbytes - avail)); 170 memcpy(((char*)p)+avail, argp->p, (nbytes - avail));
@@ -1233,6 +1234,16 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
1233 DECODE_TAIL; 1234 DECODE_TAIL;
1234} 1235}
1235 1236
1237static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc)
1238{
1239 DECODE_HEAD;
1240
1241 READ_BUF(4);
1242 READ32(rc->rca_one_fs);
1243
1244 DECODE_TAIL;
1245}
1246
1236static __be32 1247static __be32
1237nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) 1248nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
1238{ 1249{
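
The new decoder is the minimal instance of the XDR pattern used throughout this file: RECLAIM_COMPLETE carries a single 32-bit boolean on the wire. Roughly what the macros do (simplified from the definitions earlier in nfs4xdr.c, not the exact expansions):

    DECODE_HEAD;            /* declares __be32 *p and the status variable   */
    READ_BUF(4);            /* bail out with nfserr_bad_xdr unless 4 bytes  */
                            /* remain in the argument buffer; position p    */
    READ32(rc->rca_one_fs); /* rc->rca_one_fs = ntohl(*p++);                */
    DECODE_TAIL;            /* return nfs_ok on success, the error label    */
                            /* otherwise                                    */
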
@@ -1345,7 +1356,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
1345 [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, 1356 [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp,
1346 [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, 1357 [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
1347 [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp, 1358 [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp,
1348 [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_notsupp, 1359 [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete,
1349}; 1360};
1350 1361
1351struct nfsd4_minorversion_ops { 1362struct nfsd4_minorversion_ops {
@@ -1425,10 +1436,10 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1425 argp->p = page_address(argp->pagelist[0]); 1436 argp->p = page_address(argp->pagelist[0]);
1426 argp->pagelist++; 1437 argp->pagelist++;
1427 if (argp->pagelen < PAGE_SIZE) { 1438 if (argp->pagelen < PAGE_SIZE) {
1428 argp->end = p + (argp->pagelen>>2); 1439 argp->end = argp->p + (argp->pagelen>>2);
1429 argp->pagelen = 0; 1440 argp->pagelen = 0;
1430 } else { 1441 } else {
1431 argp->end = p + (PAGE_SIZE>>2); 1442 argp->end = argp->p + (PAGE_SIZE>>2);
1432 argp->pagelen -= PAGE_SIZE; 1443 argp->pagelen -= PAGE_SIZE;
1433 } 1444 }
1434 } 1445 }
@@ -1528,7 +1539,7 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c)
1528 } } while (0); 1539 } } while (0);
1529 1540
1530/* Encode as an array of strings the string given with components 1541/* Encode as an array of strings the string given with components
1531 * seperated @sep. 1542 * separated @sep.
1532 */ 1543 */
1533static __be32 nfsd4_encode_components(char sep, char *components, 1544static __be32 nfsd4_encode_components(char sep, char *components,
1534 __be32 **pp, int *buflen) 1545 __be32 **pp, int *buflen)
@@ -1899,7 +1910,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1899 if (bmval0 & FATTR4_WORD0_LEASE_TIME) { 1910 if (bmval0 & FATTR4_WORD0_LEASE_TIME) {
1900 if ((buflen -= 4) < 0) 1911 if ((buflen -= 4) < 0)
1901 goto out_resource; 1912 goto out_resource;
1902 WRITE32(NFSD_LEASE_TIME); 1913 WRITE32(nfsd4_lease);
1903 } 1914 }
1904 if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) { 1915 if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) {
1905 if ((buflen -= 4) < 0) 1916 if ((buflen -= 4) < 0)
@@ -3306,11 +3317,14 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
3306 iov = &rqstp->rq_res.head[0]; 3317 iov = &rqstp->rq_res.head[0];
3307 iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base; 3318 iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
3308 BUG_ON(iov->iov_len > PAGE_SIZE); 3319 BUG_ON(iov->iov_len > PAGE_SIZE);
3309 if (nfsd4_has_session(cs) && cs->status != nfserr_replay_cache) { 3320 if (nfsd4_has_session(cs)) {
3310 nfsd4_store_cache_entry(resp); 3321 if (cs->status != nfserr_replay_cache) {
3311 dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__); 3322 nfsd4_store_cache_entry(resp);
3312 resp->cstate.slot->sl_inuse = false; 3323 dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
3313 nfsd4_put_session(resp->cstate.session); 3324 cs->slot->sl_inuse = false;
3325 }
3326 /* Renew the clientid on success and on replay */
3327 release_session_client(cs->session);
3314 } 3328 }
3315 return 1; 3329 return 1;
3316} 3330}
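
The restructured epilogue separates two concerns that used to be fused: the reply-cache store and slot release still happen only for freshly executed requests, but the session's client reference is now dropped on replays too, which is what keeps a replaying client renewed. A sketch of what release_session_client() is expected to do (the implementation lives in nfs4state.c, outside this diff; client_lock and renew_client_locked are assumed names):

    void release_session_client(struct nfsd4_session *session)
    {
            struct nfs4_client *clp = session->se_client;

            if (!atomic_dec_and_lock(&clp->cl_refcount, &client_lock))
                    return;                 /* other rpcs still in flight */
            if (is_client_expired(clp))
                    expire_client(clp);     /* laundromat marked it; reap now */
            else
                    renew_client_locked(clp);
            spin_unlock(&client_lock);
    }
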
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index da08560c4818..4666a209678a 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -8,6 +8,8 @@
8 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> 8 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
9 */ 9 */
10 10
11#include <linux/slab.h>
12
11#include "nfsd.h" 13#include "nfsd.h"
12#include "cache.h" 14#include "cache.h"
13 15
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 0f0e77f2012f..bc3194ea01f5 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -4,6 +4,7 @@
4 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> 4 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
5 */ 5 */
6 6
7#include <linux/slab.h>
7#include <linux/namei.h> 8#include <linux/namei.h>
8#include <linux/ctype.h> 9#include <linux/ctype.h>
9 10
@@ -45,6 +46,7 @@ enum {
45 */ 46 */
46#ifdef CONFIG_NFSD_V4 47#ifdef CONFIG_NFSD_V4
47 NFSD_Leasetime, 48 NFSD_Leasetime,
49 NFSD_Gracetime,
48 NFSD_RecoveryDir, 50 NFSD_RecoveryDir,
49#endif 51#endif
50}; 52};
@@ -69,6 +71,7 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size);
69static ssize_t write_maxblksize(struct file *file, char *buf, size_t size); 71static ssize_t write_maxblksize(struct file *file, char *buf, size_t size);
70#ifdef CONFIG_NFSD_V4 72#ifdef CONFIG_NFSD_V4
71static ssize_t write_leasetime(struct file *file, char *buf, size_t size); 73static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
74static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
72static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); 75static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
73#endif 76#endif
74 77
@@ -90,6 +93,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
90 [NFSD_MaxBlkSize] = write_maxblksize, 93 [NFSD_MaxBlkSize] = write_maxblksize,
91#ifdef CONFIG_NFSD_V4 94#ifdef CONFIG_NFSD_V4
92 [NFSD_Leasetime] = write_leasetime, 95 [NFSD_Leasetime] = write_leasetime,
96 [NFSD_Gracetime] = write_gracetime,
93 [NFSD_RecoveryDir] = write_recoverydir, 97 [NFSD_RecoveryDir] = write_recoverydir,
94#endif 98#endif
95}; 99};
@@ -1203,29 +1207,45 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
1203} 1207}
1204 1208
1205#ifdef CONFIG_NFSD_V4 1209#ifdef CONFIG_NFSD_V4
1206extern time_t nfs4_leasetime(void); 1210static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time)
1207
1208static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
1209{ 1211{
1210 /* if size > 10 seconds, call
1211 * nfs4_reset_lease() then write out the new lease (seconds) as reply
1212 */
1213 char *mesg = buf; 1212 char *mesg = buf;
1214 int rv, lease; 1213 int rv, i;
1215 1214
1216 if (size > 0) { 1215 if (size > 0) {
1217 if (nfsd_serv) 1216 if (nfsd_serv)
1218 return -EBUSY; 1217 return -EBUSY;
1219 rv = get_int(&mesg, &lease); 1218 rv = get_int(&mesg, &i);
1220 if (rv) 1219 if (rv)
1221 return rv; 1220 return rv;
1222 if (lease < 10 || lease > 3600) 1221 /*
1222 * Some sanity checking. We don't have a reason for
1223 * these particular numbers, but problems with the
1224 * extremes are:
1225 * - Too short: the briefest network outage may
1226 * cause clients to lose all their locks. Also,
1227 * the frequent polling may be wasteful.
1228 * - Too long: do you really want reboot recovery
1229 * to take more than an hour? Or to make other
1230 * clients wait an hour before being able to
1231 * revoke a dead client's locks?
1232 */
1233 if (i < 10 || i > 3600)
1223 return -EINVAL; 1234 return -EINVAL;
1224 nfs4_reset_lease(lease); 1235 *time = i;
1225 } 1236 }
1226 1237
1227 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", 1238 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", *time);
1228 nfs4_lease_time()); 1239}
1240
1241static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time)
1242{
1243 ssize_t rv;
1244
1245 mutex_lock(&nfsd_mutex);
1246 rv = __nfsd4_write_time(file, buf, size, time);
1247 mutex_unlock(&nfsd_mutex);
1248 return rv;
1229} 1249}
1230 1250
1231/** 1251/**
@@ -1251,12 +1271,22 @@ static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
1251 */ 1271 */
1252static ssize_t write_leasetime(struct file *file, char *buf, size_t size) 1272static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
1253{ 1273{
1254 ssize_t rv; 1274 return nfsd4_write_time(file, buf, size, &nfsd4_lease);
1275}
1255 1276
1256 mutex_lock(&nfsd_mutex); 1277/**
1257 rv = __write_leasetime(file, buf, size); 1278 * write_gracetime - Set or report current NFSv4 grace period time
1258 mutex_unlock(&nfsd_mutex); 1279 *
1259 return rv; 1280 * As above, but sets the time of the NFSv4 grace period.
1281 *
1282 * Note this should never be set to less than the *previous*
1283 * lease-period time, but we don't try to enforce this. (In the common
1284 * case (a new boot), we don't know what the previous lease time was
1285 * anyway.)
1286 */
1287static ssize_t write_gracetime(struct file *file, char *buf, size_t size)
1288{
1289 return nfsd4_write_time(file, buf, size, &nfsd4_grace);
1260} 1290}
1261 1291
1262extern char *nfs4_recoverydir(void); 1292extern char *nfs4_recoverydir(void);
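
Both writers now funnel through one bounds-checked helper and differ only in which global they hand it. Once the nfsd filesystem is mounted (conventionally at /proc/fs/nfsd), "echo 45 > /proc/fs/nfsd/nfsv4gracetime" would set a 45-second grace period. A self-contained userspace model of the helper's contract (names and errno values inlined for illustration):

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    /* Parse an optional new value, enforce the 10..3600 bounds, and
     * report the current value back, mirroring __nfsd4_write_time(). */
    static int write_time(const char *buf, time_t *t)
    {
            if (buf && *buf) {              /* the "size > 0" case */
                    long v = strtol(buf, NULL, 10);
                    if (v < 10 || v > 3600)
                            return -22;     /* -EINVAL */
                    *t = v;
            }
            printf("%ld\n", (long)*t);      /* the transaction reply */
            return 0;
    }

    int main(void)
    {
            time_t lease = 90, grace = 90;
            write_time("45", &grace);       /* echo 45 > nfsv4gracetime */
            write_time(NULL, &lease);       /* read back the lease time */
            return 0;
    }
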
@@ -1350,6 +1380,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1350 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, 1380 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
1351#ifdef CONFIG_NFSD_V4 1381#ifdef CONFIG_NFSD_V4
1352 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, 1382 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
1383 [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
1353 [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, 1384 [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
1354#endif 1385#endif
1355 /* last one */ {""} 1386 /* last one */ {""}
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index e942a1aaac92..72377761270e 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -82,7 +82,6 @@ int nfs4_state_init(void);
82void nfsd4_free_slabs(void); 82void nfsd4_free_slabs(void);
83int nfs4_state_start(void); 83int nfs4_state_start(void);
84void nfs4_state_shutdown(void); 84void nfs4_state_shutdown(void);
85time_t nfs4_lease_time(void);
86void nfs4_reset_lease(time_t leasetime); 85void nfs4_reset_lease(time_t leasetime);
87int nfs4_reset_recoverydir(char *recdir); 86int nfs4_reset_recoverydir(char *recdir);
88#else 87#else
@@ -90,7 +89,6 @@ static inline int nfs4_state_init(void) { return 0; }
90static inline void nfsd4_free_slabs(void) { } 89static inline void nfsd4_free_slabs(void) { }
91static inline int nfs4_state_start(void) { return 0; } 90static inline int nfs4_state_start(void) { return 0; }
92static inline void nfs4_state_shutdown(void) { } 91static inline void nfs4_state_shutdown(void) { }
93static inline time_t nfs4_lease_time(void) { return 0; }
94static inline void nfs4_reset_lease(time_t leasetime) { } 92static inline void nfs4_reset_lease(time_t leasetime) { }
95static inline int nfs4_reset_recoverydir(char *recdir) { return 0; } 93static inline int nfs4_reset_recoverydir(char *recdir) { return 0; }
96#endif 94#endif
@@ -229,6 +227,9 @@ extern struct timeval nfssvc_boot;
229 227
230#ifdef CONFIG_NFSD_V4 228#ifdef CONFIG_NFSD_V4
231 229
230extern time_t nfsd4_lease;
231extern time_t nfsd4_grace;
232
232/* before processing a COMPOUND operation, we have to check that there 233/* before processing a COMPOUND operation, we have to check that there
233 * is enough space in the buffer for XDR encode to succeed. otherwise, 234 * is enough space in the buffer for XDR encode to succeed. otherwise,
234 * we might process an operation with side effects, and be unable to 235 * we might process an operation with side effects, and be unable to
@@ -247,7 +248,6 @@ extern struct timeval nfssvc_boot;
247#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */ 248#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */
248#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */ 249#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */
249 250
250#define NFSD_LEASE_TIME (nfs4_lease_time())
251#define NFSD_LAUNDROMAT_MINTIMEOUT 10 /* seconds */ 251#define NFSD_LAUNDROMAT_MINTIMEOUT 10 /* seconds */
252 252
253/* 253/*
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 171699eb07c8..06b2a26edfe0 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -120,7 +120,7 @@ u32 nfsd_supported_minorversion;
120int nfsd_vers(int vers, enum vers_op change) 120int nfsd_vers(int vers, enum vers_op change)
121{ 121{
122 if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS) 122 if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS)
123 return -1; 123 return 0;
124 switch(change) { 124 switch(change) {
125 case NFSD_SET: 125 case NFSD_SET:
126 nfsd_versions[vers] = nfsd_version[vers]; 126 nfsd_versions[vers] = nfsd_version[vers];
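
A one-character fix with real consequences: callers use nfsd_vers(vers, NFSD_TEST) as a yes/no answer, and the old -1 return for an out-of-range version is truthy. A minimal demonstration (the bounds 2 and 5 are stand-ins for NFSD_MINVERS and NFSD_NRVERS):

    #include <stdio.h>

    static int vers_old(int v) { return (v < 2 || v >= 5) ? -1 : 1; }
    static int vers_new(int v) { return (v < 2 || v >= 5) ?  0 : 1; }

    int main(void)
    {
            int bogus = 42;     /* no such NFS version */
            printf("old: %s\n", vers_old(bogus) ? "supported" : "unsupported");
            printf("new: %s\n", vers_new(bogus) ? "supported" : "unsupported");
            return 0;
    }
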
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index fefeae27f25e..006c84230c7c 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -70,6 +70,16 @@ struct nfsd4_cb_sequence {
70 struct nfs4_client *cbs_clp; 70 struct nfs4_client *cbs_clp;
71}; 71};
72 72
73struct nfs4_rpc_args {
74 void *args_op;
75 struct nfsd4_cb_sequence args_seq;
76};
77
78struct nfsd4_callback {
79 struct nfs4_rpc_args cb_args;
80 struct work_struct cb_work;
81};
82
73struct nfs4_delegation { 83struct nfs4_delegation {
74 struct list_head dl_perfile; 84 struct list_head dl_perfile;
75 struct list_head dl_perclnt; 85 struct list_head dl_perclnt;
@@ -86,6 +96,7 @@ struct nfs4_delegation {
86 stateid_t dl_stateid; 96 stateid_t dl_stateid;
87 struct knfsd_fh dl_fh; 97 struct knfsd_fh dl_fh;
88 int dl_retries; 98 int dl_retries;
99 struct nfsd4_callback dl_recall;
89}; 100};
90 101
91/* client delegation callback info */ 102/* client delegation callback info */
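
Embedding a struct nfsd4_callback (declared just above) in every delegation gives each recall a pre-allocated work item, so queueing a CB_RECALL cannot fail for lack of memory at recall time. How it is presumably wired up (the real code is in nfs4state.c and nfs4callback.c, outside this diff; the callback_wq name is an assumption):

    static struct workqueue_struct *callback_wq;    /* created by
                                                       nfsd4_create_callback_queue() */

    static void recall_delegation(struct nfs4_delegation *dp)
    {
            INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc);
            queue_work(callback_wq, &dp->dl_recall.cb_work);
    }
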
@@ -96,9 +107,7 @@ struct nfs4_cb_conn {
96 u32 cb_prog; 107 u32 cb_prog;
97 u32 cb_minorversion; 108 u32 cb_minorversion;
98 u32 cb_ident; /* minorversion 0 only */ 109 u32 cb_ident; /* minorversion 0 only */
99 /* RPC client info */ 110 struct svc_xprt *cb_xprt; /* minorversion 1 only */
100 atomic_t cb_set; /* successful CB_NULL call */
101 struct rpc_clnt * cb_client;
102}; 111};
103 112
104/* Maximum number of slots per session. 160 is useful for long haul TCP */ 113/* Maximum number of slots per session. 160 is useful for long haul TCP */
@@ -157,7 +166,7 @@ struct nfsd4_session {
157 struct list_head se_hash; /* hash by sessionid */ 166 struct list_head se_hash; /* hash by sessionid */
158 struct list_head se_perclnt; 167 struct list_head se_perclnt;
159 u32 se_flags; 168 u32 se_flags;
160 struct nfs4_client *se_client; /* for expire_client */ 169 struct nfs4_client *se_client;
161 struct nfs4_sessionid se_sessionid; 170 struct nfs4_sessionid se_sessionid;
162 struct nfsd4_channel_attrs se_fchannel; 171 struct nfsd4_channel_attrs se_fchannel;
163 struct nfsd4_channel_attrs se_bchannel; 172 struct nfsd4_channel_attrs se_bchannel;
@@ -212,25 +221,41 @@ struct nfs4_client {
212 struct svc_cred cl_cred; /* setclientid principal */ 221 struct svc_cred cl_cred; /* setclientid principal */
213 clientid_t cl_clientid; /* generated by server */ 222 clientid_t cl_clientid; /* generated by server */
214 nfs4_verifier cl_confirm; /* generated by server */ 223 nfs4_verifier cl_confirm; /* generated by server */
215 struct nfs4_cb_conn cl_cb_conn; /* callback info */
216 atomic_t cl_count; /* ref count */
217 u32 cl_firststate; /* recovery dir creation */ 224 u32 cl_firststate; /* recovery dir creation */
218 225
226 /* for v4.0 and v4.1 callbacks: */
227 struct nfs4_cb_conn cl_cb_conn;
228 struct rpc_clnt *cl_cb_client;
229 atomic_t cl_cb_set;
230
219 /* for nfs41 */ 231 /* for nfs41 */
220 struct list_head cl_sessions; 232 struct list_head cl_sessions;
221 struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */ 233 struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */
222 u32 cl_exchange_flags; 234 u32 cl_exchange_flags;
223 struct nfs4_sessionid cl_sessionid; 235 struct nfs4_sessionid cl_sessionid;
236 /* number of rpc's in progress over an associated session: */
237 atomic_t cl_refcount;
224 238
225 /* for nfs41 callbacks */ 239 /* for nfs41 callbacks */
226 /* We currently support a single back channel with a single slot */ 240 /* We currently support a single back channel with a single slot */
227 unsigned long cl_cb_slot_busy; 241 unsigned long cl_cb_slot_busy;
228 u32 cl_cb_seq_nr; 242 u32 cl_cb_seq_nr;
229 struct svc_xprt *cl_cb_xprt; /* 4.1 callback transport */
230 struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ 243 struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */
231 /* wait here for slots */ 244 /* wait here for slots */
232}; 245};
233 246
247static inline void
248mark_client_expired(struct nfs4_client *clp)
249{
250 clp->cl_time = 0;
251}
252
253static inline bool
254is_client_expired(struct nfs4_client *clp)
255{
256 return clp->cl_time == 0;
257}
258
234/* struct nfs4_client_reset 259/* struct nfs4_client_reset
235 * one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl 260 * one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl
236 * upon lease reset, or from upcall to state_daemon (to read in state 261 * upon lease reset, or from upcall to state_daemon (to read in state
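
The two new inlines make cl_time do double duty as an expiry sentinel: cl_refcount counts RPCs in flight over the client's sessions, and a client that expires while still busy is only marked, with the final reference holder completing the teardown. A sketch of the intended protocol (an inference from the helpers above and release_session_client(), not code from this diff):

    static void laundromat_expire(struct nfs4_client *clp)
    {
            if (atomic_read(&clp->cl_refcount) == 0)
                    expire_client(clp);       /* idle: reap immediately      */
            else
                    mark_client_expired(clp); /* busy: cl_time = 0; the last */
                                              /* release_session_client()    */
                                              /* finishes the job            */
    }
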
@@ -377,11 +402,14 @@ extern void nfs4_lock_state(void);
377extern void nfs4_unlock_state(void); 402extern void nfs4_unlock_state(void);
378extern int nfs4_in_grace(void); 403extern int nfs4_in_grace(void);
379extern __be32 nfs4_check_open_reclaim(clientid_t *clid); 404extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
380extern void put_nfs4_client(struct nfs4_client *clp);
381extern void nfs4_free_stateowner(struct kref *kref); 405extern void nfs4_free_stateowner(struct kref *kref);
382extern int set_callback_cred(void); 406extern int set_callback_cred(void);
383extern void nfsd4_probe_callback(struct nfs4_client *clp); 407extern void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
408extern void nfsd4_do_callback_rpc(struct work_struct *);
384extern void nfsd4_cb_recall(struct nfs4_delegation *dp); 409extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
410extern int nfsd4_create_callback_queue(void);
411extern void nfsd4_destroy_callback_queue(void);
412extern void nfsd4_set_callback_client(struct nfs4_client *, struct rpc_clnt *);
385extern void nfs4_put_delegation(struct nfs4_delegation *dp); 413extern void nfs4_put_delegation(struct nfs4_delegation *dp);
386extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); 414extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
387extern void nfsd4_init_recdir(char *recdir_name); 415extern void nfsd4_init_recdir(char *recdir_name);
@@ -392,6 +420,7 @@ extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id);
392extern void nfsd4_recdir_purge_old(void); 420extern void nfsd4_recdir_purge_old(void);
393extern int nfsd4_create_clid_dir(struct nfs4_client *clp); 421extern int nfsd4_create_clid_dir(struct nfs4_client *clp);
394extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); 422extern void nfsd4_remove_clid_dir(struct nfs4_client *clp);
423extern void release_session_client(struct nfsd4_session *);
395 424
396static inline void 425static inline void
397nfs4_put_stateowner(struct nfs4_stateowner *so) 426nfs4_put_stateowner(struct nfs4_stateowner *so)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a11b0e8678ee..23c06f77f4ca 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -25,6 +25,7 @@
25#include <linux/xattr.h> 25#include <linux/xattr.h>
26#include <linux/jhash.h> 26#include <linux/jhash.h>
27#include <linux/ima.h> 27#include <linux/ima.h>
28#include <linux/slab.h>
28#include <asm/uaccess.h> 29#include <asm/uaccess.h>
29#include <linux/exportfs.h> 30#include <linux/exportfs.h>
30#include <linux/writeback.h> 31#include <linux/writeback.h>
@@ -723,7 +724,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
723 struct inode *inode; 724 struct inode *inode;
724 int flags = O_RDONLY|O_LARGEFILE; 725 int flags = O_RDONLY|O_LARGEFILE;
725 __be32 err; 726 __be32 err;
726 int host_err; 727 int host_err = 0;
727 728
728 validate_process_creds(); 729 validate_process_creds();
729 730
@@ -760,7 +761,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
760 * Check to see if there are any leases on this file. 761 * Check to see if there are any leases on this file.
761 * This may block while leases are broken. 762 * This may block while leases are broken.
762 */ 763 */
763 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0)); 764 if (!(access & NFSD_MAY_NOT_BREAK_LEASE))
765 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0));
764 if (host_err == -EWOULDBLOCK) 766 if (host_err == -EWOULDBLOCK)
765 host_err = -ETIMEDOUT; 767 host_err = -ETIMEDOUT;
766 if (host_err) /* NOMEM or WOULDBLOCK */ 768 if (host_err) /* NOMEM or WOULDBLOCK */
@@ -1168,7 +1170,8 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
1168 goto out; 1170 goto out;
1169 } 1171 }
1170 1172
1171 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); 1173 err = nfsd_open(rqstp, fhp, S_IFREG,
1174 NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &file);
1172 if (err) 1175 if (err)
1173 goto out; 1176 goto out;
1174 if (EX_ISSYNC(fhp->fh_export)) { 1177 if (EX_ISSYNC(fhp->fh_export)) {
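
The rationale for NFSD_MAY_NOT_BREAK_LEASE, in comment form (the deadlock point is an inference, not stated in the patch):

    /* A COMMIT only flushes data that an earlier WRITE already handed to
     * the filesystem, and that WRITE's own nfsd_open() broke any
     * conflicting lease:
     *
     *   WRITE  -> nfsd_open(NFSD_MAY_WRITE)                breaks the lease
     *   COMMIT -> nfsd_open(NFSD_MAY_WRITE |
     *                       NFSD_MAY_NOT_BREAK_LEASE)      skips that step
     *
     * Breaking the lease again from COMMIT could block the reply
     * needlessly, or possibly deadlock against nfsd's own delegation
     * recalls.  The "host_err = 0" initialization in the first vfs.c
     * hunk exists for the same reason: break_lease() is now a
     * conditional first assignment of host_err before its first test. */
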
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 4b1de0a9ea75..217a62c2a357 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -20,6 +20,7 @@
20#define NFSD_MAY_OWNER_OVERRIDE 64 20#define NFSD_MAY_OWNER_OVERRIDE 64
21#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ 21#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/
22#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256 22#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256
23#define NFSD_MAY_NOT_BREAK_LEASE 512
23 24
24#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) 25#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE)
25#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) 26#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index efa337739534..4d476ff08ae6 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -381,6 +381,10 @@ struct nfsd4_destroy_session {
381 struct nfs4_sessionid sessionid; 381 struct nfs4_sessionid sessionid;
382}; 382};
383 383
384struct nfsd4_reclaim_complete {
385 u32 rca_one_fs;
386};
387
384struct nfsd4_op { 388struct nfsd4_op {
385 int opnum; 389 int opnum;
386 __be32 status; 390 __be32 status;
@@ -421,6 +425,7 @@ struct nfsd4_op {
421 struct nfsd4_create_session create_session; 425 struct nfsd4_create_session create_session;
422 struct nfsd4_destroy_session destroy_session; 426 struct nfsd4_destroy_session destroy_session;
423 struct nfsd4_sequence sequence; 427 struct nfsd4_sequence sequence;
428 struct nfsd4_reclaim_complete reclaim_complete;
424 } u; 429 } u;
425 struct nfs4_replay * replay; 430 struct nfs4_replay * replay;
426}; 431};
@@ -513,9 +518,8 @@ extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp);
513extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, 518extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
514 struct nfsd4_sequence *seq); 519 struct nfsd4_sequence *seq);
515extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, 520extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
516 struct nfsd4_compound_state *, 521 struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
517struct nfsd4_exchange_id *); 522extern __be32 nfsd4_create_session(struct svc_rqst *,
518 extern __be32 nfsd4_create_session(struct svc_rqst *,
519 struct nfsd4_compound_state *, 523 struct nfsd4_compound_state *,
520 struct nfsd4_create_session *); 524 struct nfsd4_create_session *);
521extern __be32 nfsd4_sequence(struct svc_rqst *, 525extern __be32 nfsd4_sequence(struct svc_rqst *,
@@ -524,6 +528,7 @@ extern __be32 nfsd4_sequence(struct svc_rqst *,
524extern __be32 nfsd4_destroy_session(struct svc_rqst *, 528extern __be32 nfsd4_destroy_session(struct svc_rqst *,
525 struct nfsd4_compound_state *, 529 struct nfsd4_compound_state *,
526 struct nfsd4_destroy_session *); 530 struct nfsd4_destroy_session *);
531__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *);
527extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, 532extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *,
528 struct nfsd4_open *open); 533 struct nfsd4_open *open);
529extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, 534extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 3f959f1879d8..d7fd696e595c 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -26,10 +26,16 @@
26#include <linux/buffer_head.h> 26#include <linux/buffer_head.h>
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/bitops.h> 28#include <linux/bitops.h>
29#include <linux/slab.h>
29#include "mdt.h" 30#include "mdt.h"
30#include "alloc.h" 31#include "alloc.h"
31 32
32 33
34/**
35 * nilfs_palloc_groups_per_desc_block - get the number of groups that a group
36 * descriptor block can maintain
37 * @inode: inode of metadata file using this allocator
38 */
33static inline unsigned long 39static inline unsigned long
34nilfs_palloc_groups_per_desc_block(const struct inode *inode) 40nilfs_palloc_groups_per_desc_block(const struct inode *inode)
35{ 41{
@@ -37,12 +43,21 @@ nilfs_palloc_groups_per_desc_block(const struct inode *inode)
37 sizeof(struct nilfs_palloc_group_desc); 43 sizeof(struct nilfs_palloc_group_desc);
38} 44}
39 45
46/**
47 * nilfs_palloc_groups_count - get maximum number of groups
48 * @inode: inode of metadata file using this allocator
49 */
40static inline unsigned long 50static inline unsigned long
41nilfs_palloc_groups_count(const struct inode *inode) 51nilfs_palloc_groups_count(const struct inode *inode)
42{ 52{
43 return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */)); 53 return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */));
44} 54}
45 55
56/**
57 * nilfs_palloc_init_blockgroup - initialize private variables for allocator
58 * @inode: inode of metadata file using this allocator
59 * @entry_size: size of the persistent object
60 */
46int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size) 61int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
47{ 62{
48 struct nilfs_mdt_info *mi = NILFS_MDT(inode); 63 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
@@ -68,6 +83,12 @@ int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
68 return 0; 83 return 0;
69} 84}
70 85
86/**
87 * nilfs_palloc_group - get group number and offset from an entry number
88 * @inode: inode of metadata file using this allocator
89 * @nr: serial number of the entry (e.g. inode number)
90 * @offset: pointer to store offset number in the group
91 */
71static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr, 92static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr,
72 unsigned long *offset) 93 unsigned long *offset)
73{ 94{
@@ -77,6 +98,14 @@ static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr,
77 return group; 98 return group;
78} 99}
79 100
101/**
102 * nilfs_palloc_desc_blkoff - get block offset of a group descriptor block
103 * @inode: inode of metadata file using this allocator
104 * @group: group number
105 *
106 * nilfs_palloc_desc_blkoff() returns block offset of the descriptor
107 * block which contains a descriptor of the specified group.
108 */
80static unsigned long 109static unsigned long
81nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group) 110nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group)
82{ 111{
@@ -85,6 +114,14 @@ nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group)
85 return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block; 114 return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block;
86} 115}
87 116
117/**
118 * nilfs_palloc_bitmap_blkoff - get block offset of a bitmap block
119 * @inode: inode of metadata file using this allocator
120 * @group: group number
121 *
122 * nilfs_palloc_bitmap_blkoff() returns block offset of the bitmap
123 * block used to allocate/deallocate entries in the specified group.
124 */
88static unsigned long 125static unsigned long
89nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group) 126nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
90{ 127{
@@ -94,6 +131,12 @@ nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
94 desc_offset * NILFS_MDT(inode)->mi_blocks_per_group; 131 desc_offset * NILFS_MDT(inode)->mi_blocks_per_group;
95} 132}
96 133
134/**
135 * nilfs_palloc_group_desc_nfrees - get the number of free entries in a group
136 * @inode: inode of metadata file using this allocator
137 * @group: group number
138 * @desc: pointer to descriptor structure for the group
139 */
97static unsigned long 140static unsigned long
98nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group, 141nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
99 const struct nilfs_palloc_group_desc *desc) 142 const struct nilfs_palloc_group_desc *desc)
@@ -106,6 +149,13 @@ nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
106 return nfree; 149 return nfree;
107} 150}
108 151
152/**
153 * nilfs_palloc_group_desc_add_entries - adjust count of free entries
154 * @inode: inode of metadata file using this allocator
155 * @group: group number
156 * @desc: pointer to descriptor structure for the group
157 * @n: delta to be added
158 */
109static void 159static void
110nilfs_palloc_group_desc_add_entries(struct inode *inode, 160nilfs_palloc_group_desc_add_entries(struct inode *inode,
111 unsigned long group, 161 unsigned long group,
@@ -117,6 +167,11 @@ nilfs_palloc_group_desc_add_entries(struct inode *inode,
117 spin_unlock(nilfs_mdt_bgl_lock(inode, group)); 167 spin_unlock(nilfs_mdt_bgl_lock(inode, group));
118} 168}
119 169
170/**
171 * nilfs_palloc_entry_blkoff - get block offset of an entry block
172 * @inode: inode of metadata file using this allocator
173 * @nr: serial number of the entry (e.g. inode number)
174 */
120static unsigned long 175static unsigned long
121nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr) 176nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr)
122{ 177{
@@ -128,6 +183,12 @@ nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr)
128 group_offset / NILFS_MDT(inode)->mi_entries_per_block; 183 group_offset / NILFS_MDT(inode)->mi_entries_per_block;
129} 184}
130 185
186/**
187 * nilfs_palloc_desc_block_init - initialize buffer of a group descriptor block
188 * @inode: inode of metadata file
189 * @bh: buffer head of the buffer to be initialized
190 * @kaddr: kernel address mapped for the page including the buffer
191 */
131static void nilfs_palloc_desc_block_init(struct inode *inode, 192static void nilfs_palloc_desc_block_init(struct inode *inode,
132 struct buffer_head *bh, void *kaddr) 193 struct buffer_head *bh, void *kaddr)
133{ 194{
@@ -178,6 +239,13 @@ static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff,
178 return ret; 239 return ret;
179} 240}
180 241
242/**
243 * nilfs_palloc_get_desc_block - get buffer head of a group descriptor block
244 * @inode: inode of metadata file using this allocator
245 * @group: group number
246 * @create: create flag
247 * @bhp: pointer to store the resultant buffer head
248 */
181static int nilfs_palloc_get_desc_block(struct inode *inode, 249static int nilfs_palloc_get_desc_block(struct inode *inode,
182 unsigned long group, 250 unsigned long group,
183 int create, struct buffer_head **bhp) 251 int create, struct buffer_head **bhp)
@@ -190,6 +258,13 @@ static int nilfs_palloc_get_desc_block(struct inode *inode,
190 bhp, &cache->prev_desc, &cache->lock); 258 bhp, &cache->prev_desc, &cache->lock);
191} 259}
192 260
261/**
262 * nilfs_palloc_get_bitmap_block - get buffer head of a bitmap block
263 * @inode: inode of metadata file using this allocator
264 * @group: group number
265 * @create: create flag
266 * @bhp: pointer to store the resultant buffer head
267 */
193static int nilfs_palloc_get_bitmap_block(struct inode *inode, 268static int nilfs_palloc_get_bitmap_block(struct inode *inode,
194 unsigned long group, 269 unsigned long group,
195 int create, struct buffer_head **bhp) 270 int create, struct buffer_head **bhp)
@@ -202,6 +277,13 @@ static int nilfs_palloc_get_bitmap_block(struct inode *inode,
202 &cache->prev_bitmap, &cache->lock); 277 &cache->prev_bitmap, &cache->lock);
203} 278}
204 279
280/**
281 * nilfs_palloc_get_entry_block - get buffer head of an entry block
282 * @inode: inode of metadata file using this allocator
283 * @nr: serial number of the entry (e.g. inode number)
284 * @create: create flag
285 * @bhp: pointer to store the resultant buffer head
286 */
205int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, 287int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
206 int create, struct buffer_head **bhp) 288 int create, struct buffer_head **bhp)
207{ 289{
@@ -213,6 +295,13 @@ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
213 &cache->prev_entry, &cache->lock); 295 &cache->prev_entry, &cache->lock);
214} 296}
215 297
298/**
299 * nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor
300 * @inode: inode of metadata file using this allocator
301 * @group: group number
302 * @bh: buffer head of the buffer storing the group descriptor block
303 * @kaddr: kernel address mapped for the page including the buffer
304 */
216static struct nilfs_palloc_group_desc * 305static struct nilfs_palloc_group_desc *
217nilfs_palloc_block_get_group_desc(const struct inode *inode, 306nilfs_palloc_block_get_group_desc(const struct inode *inode,
218 unsigned long group, 307 unsigned long group,
@@ -222,6 +311,13 @@ nilfs_palloc_block_get_group_desc(const struct inode *inode,
222 group % nilfs_palloc_groups_per_desc_block(inode); 311 group % nilfs_palloc_groups_per_desc_block(inode);
223} 312}
224 313
314/**
315 * nilfs_palloc_block_get_entry - get kernel address of an entry
316 * @inode: inode of metadata file using this allocator
317 * @nr: serial number of the entry (e.g. inode number)
318 * @bh: buffer head of the buffer storing the entry block
319 * @kaddr: kernel address mapped for the page including the buffer
320 */
225void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, 321void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
226 const struct buffer_head *bh, void *kaddr) 322 const struct buffer_head *bh, void *kaddr)
227{ 323{
@@ -234,11 +330,19 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
234 entry_offset * NILFS_MDT(inode)->mi_entry_size; 330 entry_offset * NILFS_MDT(inode)->mi_entry_size;
235} 331}
236 332
333/**
334 * nilfs_palloc_find_available_slot - find available slot in a group
335 * @inode: inode of metadata file using this allocator
336 * @group: group number
337 * @target: offset number of an entry in the group (start point)
338 * @bitmap: bitmap of the group
339 * @bsize: size in bits
340 */
237static int nilfs_palloc_find_available_slot(struct inode *inode, 341static int nilfs_palloc_find_available_slot(struct inode *inode,
238 unsigned long group, 342 unsigned long group,
239 unsigned long target, 343 unsigned long target,
240 unsigned char *bitmap, 344 unsigned char *bitmap,
241 int bsize) /* size in bits */ 345 int bsize)
242{ 346{
243 int curr, pos, end, i; 347 int curr, pos, end, i;
244 348
@@ -276,6 +380,13 @@ static int nilfs_palloc_find_available_slot(struct inode *inode,
276 return -ENOSPC; 380 return -ENOSPC;
277} 381}
278 382
383/**
384 * nilfs_palloc_rest_groups_in_desc_block - get the remaining number of groups
385 * in a group descriptor block
386 * @inode: inode of metadata file using this allocator
387 * @curr: current group number
388 * @max: maximum number of groups
389 */
279static unsigned long 390static unsigned long
280nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, 391nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
281 unsigned long curr, unsigned long max) 392 unsigned long curr, unsigned long max)
@@ -286,6 +397,11 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
286 max - curr + 1); 397 max - curr + 1);
287} 398}
288 399
400/**
401 * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object
402 * @inode: inode of metadata file using this allocator
403 * @req: nilfs_palloc_req structure exchanged for the allocation
404 */
289int nilfs_palloc_prepare_alloc_entry(struct inode *inode, 405int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
290 struct nilfs_palloc_req *req) 406 struct nilfs_palloc_req *req)
291{ 407{
@@ -365,6 +481,11 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
365 return ret; 481 return ret;
366} 482}
367 483
484/**
485 * nilfs_palloc_commit_alloc_entry - finish allocation of a persistent object
486 * @inode: inode of metadata file using this allocator
487 * @req: nilfs_palloc_req structure exchanged for the allocation
488 */
368void nilfs_palloc_commit_alloc_entry(struct inode *inode, 489void nilfs_palloc_commit_alloc_entry(struct inode *inode,
369 struct nilfs_palloc_req *req) 490 struct nilfs_palloc_req *req)
370{ 491{
@@ -376,6 +497,11 @@ void nilfs_palloc_commit_alloc_entry(struct inode *inode,
376 brelse(req->pr_desc_bh); 497 brelse(req->pr_desc_bh);
377} 498}
378 499
500/**
501 * nilfs_palloc_commit_free_entry - finish deallocating a persistent object
502 * @inode: inode of metadata file using this allocator
503 * @req: nilfs_palloc_req structure exchanged for the removal
504 */
379void nilfs_palloc_commit_free_entry(struct inode *inode, 505void nilfs_palloc_commit_free_entry(struct inode *inode,
380 struct nilfs_palloc_req *req) 506 struct nilfs_palloc_req *req)
381{ 507{
@@ -409,6 +535,11 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
409 brelse(req->pr_desc_bh); 535 brelse(req->pr_desc_bh);
410} 536}
411 537
538/**
539 * nilfs_palloc_abort_alloc_entry - cancel allocation of a persistent object
540 * @inode: inode of metadata file using this allocator
541 * @req: nilfs_palloc_req structure exchanged for the allocation
542 */
412void nilfs_palloc_abort_alloc_entry(struct inode *inode, 543void nilfs_palloc_abort_alloc_entry(struct inode *inode,
413 struct nilfs_palloc_req *req) 544 struct nilfs_palloc_req *req)
414{ 545{
@@ -425,7 +556,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
425 bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); 556 bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
426 if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), 557 if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
427 group_offset, bitmap)) 558 group_offset, bitmap))
428 printk(KERN_WARNING "%s: entry numer %llu already freed\n", 559 printk(KERN_WARNING "%s: entry number %llu already freed\n",
429 __func__, (unsigned long long)req->pr_entry_nr); 560 __func__, (unsigned long long)req->pr_entry_nr);
430 561
431 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); 562 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
@@ -441,6 +572,11 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
441 req->pr_desc_bh = NULL; 572 req->pr_desc_bh = NULL;
442} 573}
443 574
575/**
576 * nilfs_palloc_prepare_free_entry - prepare to deallocate a persistent object
577 * @inode: inode of metadata file using this allocator
578 * @req: nilfs_palloc_req structure exchanged for the removal
579 */
444int nilfs_palloc_prepare_free_entry(struct inode *inode, 580int nilfs_palloc_prepare_free_entry(struct inode *inode,
445 struct nilfs_palloc_req *req) 581 struct nilfs_palloc_req *req)
446{ 582{
@@ -463,6 +599,11 @@ int nilfs_palloc_prepare_free_entry(struct inode *inode,
463 return 0; 599 return 0;
464} 600}
465 601
602/**
603 * nilfs_palloc_abort_free_entry - cancel deallocating a persistent object
604 * @inode: inode of metadata file using this allocator
605 * @req: nilfs_palloc_req structure exchanged for the removal
606 */
466void nilfs_palloc_abort_free_entry(struct inode *inode, 607void nilfs_palloc_abort_free_entry(struct inode *inode,
467 struct nilfs_palloc_req *req) 608 struct nilfs_palloc_req *req)
468{ 609{
@@ -474,6 +615,12 @@ void nilfs_palloc_abort_free_entry(struct inode *inode,
474 req->pr_desc_bh = NULL; 615 req->pr_desc_bh = NULL;
475} 616}
476 617
618/**
619 * nilfs_palloc_group_is_in - judge if an entry is in a group
620 * @inode: inode of metadata file using this allocator
621 * @group: group number
622 * @nr: serial number of the entry (e.g. inode number)
623 */
477static int 624static int
478nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr) 625nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
479{ 626{
@@ -484,6 +631,12 @@ nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
484 return (nr >= first) && (nr <= last); 631 return (nr >= first) && (nr <= last);
485} 632}
486 633
634/**
635 * nilfs_palloc_freev - deallocate a set of persistent objects
636 * @inode: inode of metadata file using this allocator
637 * @entry_nrs: array of entry numbers to be deallocated
638 * @nitems: number of entries stored in @entry_nrs
639 */
487int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) 640int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
488{ 641{
489 struct buffer_head *desc_bh, *bitmap_bh; 642 struct buffer_head *desc_bh, *bitmap_bh;
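
Taken together, the new kernel-doc documents the allocator's layout: entries live in fixed-size groups, each group owns one bitmap block (a bit per entry), and every nilfs_palloc_groups_per_desc_block() groups share a descriptor block holding per-group free counts. The arithmetic the helpers implement, simplified (the kernel code uses do_div() on the 64-bit entry number):

    unsigned long epg    = nilfs_palloc_entries_per_group(inode);
    unsigned long group  = nr / epg;    /* which group the entry lives in */
    unsigned long offset = nr % epg;    /* its slot inside that group     */
    /* From there:
     *   descriptor blkoff: (group / groups_per_desc_block)
     *                        * mi_blocks_per_desc_block
     *   bitmap blkoff:     descriptor blkoff plus the group's offset
     *                        within its descriptor stride
     *   entry blkoff:      bitmap blkoff plus
     *                        offset / mi_entries_per_block              */
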
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index f4543ac4f560..9af34a7e6e13 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -29,6 +29,13 @@
29#include <linux/buffer_head.h> 29#include <linux/buffer_head.h>
30#include <linux/fs.h> 30#include <linux/fs.h>
31 31
32/**
33 * nilfs_palloc_entries_per_group - get the number of entries per group
34 * @inode: inode of metadata file using this allocator
35 *
36 * The number of entries per group is defined by the number of bits
37 * that a bitmap block can maintain.
38 */
32static inline unsigned long 39static inline unsigned long
33nilfs_palloc_entries_per_group(const struct inode *inode) 40nilfs_palloc_entries_per_group(const struct inode *inode)
34{ 41{
@@ -42,7 +49,7 @@ void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
42 const struct buffer_head *, void *); 49 const struct buffer_head *, void *);
43 50
44/** 51/**
45 * nilfs_palloc_req - persistent alloctor request and reply 52 * nilfs_palloc_req - persistent allocator request and reply
46 * @pr_entry_nr: entry number (vblocknr or inode number) 53 * @pr_entry_nr: entry number (vblocknr or inode number)
47 * @pr_desc_bh: buffer head of the buffer containing block group descriptors 54 * @pr_desc_bh: buffer head of the buffer containing block group descriptors
48 * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap 55 * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 471e269536ae..447ce47a3306 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -27,6 +27,7 @@
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/backing-dev.h> 29#include <linux/backing-dev.h>
30#include <linux/gfp.h>
30#include "nilfs.h" 31#include "nilfs.h"
31#include "mdt.h" 32#include "mdt.h"
32#include "dat.h" 33#include "dat.h"
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 7cdd98b8d514..b27a342c5af6 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -31,63 +31,16 @@
31#include "alloc.h" 31#include "alloc.h"
32#include "dat.h" 32#include "dat.h"
33 33
34/** 34static struct nilfs_btree_path *nilfs_btree_alloc_path(void)
35 * struct nilfs_btree_path - A path on which B-tree operations are executed
36 * @bp_bh: buffer head of node block
37 * @bp_sib_bh: buffer head of sibling node block
38 * @bp_index: index of child node
39 * @bp_oldreq: ptr end request for old ptr
40 * @bp_newreq: ptr alloc request for new ptr
41 * @bp_op: rebalance operation
42 */
43struct nilfs_btree_path {
44 struct buffer_head *bp_bh;
45 struct buffer_head *bp_sib_bh;
46 int bp_index;
47 union nilfs_bmap_ptr_req bp_oldreq;
48 union nilfs_bmap_ptr_req bp_newreq;
49 struct nilfs_btnode_chkey_ctxt bp_ctxt;
50 void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *,
51 int, __u64 *, __u64 *);
52};
53
54/*
55 * B-tree path operations
56 */
57
58static struct kmem_cache *nilfs_btree_path_cache;
59
60int __init nilfs_btree_path_cache_init(void)
61{
62 nilfs_btree_path_cache =
63 kmem_cache_create("nilfs2_btree_path_cache",
64 sizeof(struct nilfs_btree_path) *
65 NILFS_BTREE_LEVEL_MAX, 0, 0, NULL);
66 return (nilfs_btree_path_cache != NULL) ? 0 : -ENOMEM;
67}
68
69void nilfs_btree_path_cache_destroy(void)
70{
71 kmem_cache_destroy(nilfs_btree_path_cache);
72}
73
74static inline struct nilfs_btree_path *nilfs_btree_alloc_path(void)
75{
76 return kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
77}
78
79static inline void nilfs_btree_free_path(struct nilfs_btree_path *path)
80{ 35{
81 kmem_cache_free(nilfs_btree_path_cache, path); 36 struct nilfs_btree_path *path;
82} 37 int level = NILFS_BTREE_LEVEL_DATA;
83 38
84static void nilfs_btree_init_path(struct nilfs_btree_path *path) 39 path = kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
85{ 40 if (path == NULL)
86 int level; 41 goto out;
87 42
88 for (level = NILFS_BTREE_LEVEL_DATA; 43 for (; level < NILFS_BTREE_LEVEL_MAX; level++) {
89 level < NILFS_BTREE_LEVEL_MAX;
90 level++) {
91 path[level].bp_bh = NULL; 44 path[level].bp_bh = NULL;
92 path[level].bp_sib_bh = NULL; 45 path[level].bp_sib_bh = NULL;
93 path[level].bp_index = 0; 46 path[level].bp_index = 0;
@@ -95,15 +48,19 @@ static void nilfs_btree_init_path(struct nilfs_btree_path *path)
95 path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; 48 path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
96 path[level].bp_op = NULL; 49 path[level].bp_op = NULL;
97 } 50 }
51
52out:
53 return path;
98} 54}
99 55
100static void nilfs_btree_release_path(struct nilfs_btree_path *path) 56static void nilfs_btree_free_path(struct nilfs_btree_path *path)
101{ 57{
102 int level; 58 int level = NILFS_BTREE_LEVEL_DATA;
103 59
104 for (level = NILFS_BTREE_LEVEL_DATA; level < NILFS_BTREE_LEVEL_MAX; 60 for (; level < NILFS_BTREE_LEVEL_MAX; level++)
105 level++)
106 brelse(path[level].bp_bh); 61 brelse(path[level].bp_bh);
62
63 kmem_cache_free(nilfs_btree_path_cache, path);
107} 64}
108 65
109/* 66/*
@@ -566,14 +523,12 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
566 path = nilfs_btree_alloc_path(); 523 path = nilfs_btree_alloc_path();
567 if (path == NULL) 524 if (path == NULL)
568 return -ENOMEM; 525 return -ENOMEM;
569 nilfs_btree_init_path(path);
570 526
571 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); 527 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
572 528
573 if (ptrp != NULL) 529 if (ptrp != NULL)
574 *ptrp = ptr; 530 *ptrp = ptr;
575 531
576 nilfs_btree_release_path(path);
577 nilfs_btree_free_path(path); 532 nilfs_btree_free_path(path);
578 533
579 return ret; 534 return ret;
@@ -594,7 +549,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
594 path = nilfs_btree_alloc_path(); 549 path = nilfs_btree_alloc_path();
595 if (path == NULL) 550 if (path == NULL)
596 return -ENOMEM; 551 return -ENOMEM;
597 nilfs_btree_init_path(path); 552
598 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); 553 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
599 if (ret < 0) 554 if (ret < 0)
600 goto out; 555 goto out;
@@ -655,7 +610,6 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
655 *ptrp = ptr; 610 *ptrp = ptr;
656 ret = cnt; 611 ret = cnt;
657 out: 612 out:
658 nilfs_btree_release_path(path);
659 nilfs_btree_free_path(path); 613 nilfs_btree_free_path(path);
660 return ret; 614 return ret;
661} 615}
@@ -1123,7 +1077,6 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
1123 path = nilfs_btree_alloc_path(); 1077 path = nilfs_btree_alloc_path();
1124 if (path == NULL) 1078 if (path == NULL)
1125 return -ENOMEM; 1079 return -ENOMEM;
1126 nilfs_btree_init_path(path);
1127 1080
1128 ret = nilfs_btree_do_lookup(btree, path, key, NULL, 1081 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1129 NILFS_BTREE_LEVEL_NODE_MIN); 1082 NILFS_BTREE_LEVEL_NODE_MIN);
@@ -1140,7 +1093,6 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
1140 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); 1093 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
1141 1094
1142 out: 1095 out:
1143 nilfs_btree_release_path(path);
1144 nilfs_btree_free_path(path); 1096 nilfs_btree_free_path(path);
1145 return ret; 1097 return ret;
1146} 1098}
@@ -1456,7 +1408,7 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
1456 path = nilfs_btree_alloc_path(); 1408 path = nilfs_btree_alloc_path();
1457 if (path == NULL) 1409 if (path == NULL)
1458 return -ENOMEM; 1410 return -ENOMEM;
1459 nilfs_btree_init_path(path); 1411
1460 ret = nilfs_btree_do_lookup(btree, path, key, NULL, 1412 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1461 NILFS_BTREE_LEVEL_NODE_MIN); 1413 NILFS_BTREE_LEVEL_NODE_MIN);
1462 if (ret < 0) 1414 if (ret < 0)
@@ -1473,7 +1425,6 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
1473 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); 1425 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
1474 1426
1475out: 1427out:
1476 nilfs_btree_release_path(path);
1477 nilfs_btree_free_path(path); 1428 nilfs_btree_free_path(path);
1478 return ret; 1429 return ret;
1479} 1430}
@@ -1488,11 +1439,9 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
1488 path = nilfs_btree_alloc_path(); 1439 path = nilfs_btree_alloc_path();
1489 if (path == NULL) 1440 if (path == NULL)
1490 return -ENOMEM; 1441 return -ENOMEM;
1491 nilfs_btree_init_path(path);
1492 1442
1493 ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL); 1443 ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);
1494 1444
1495 nilfs_btree_release_path(path);
1496 nilfs_btree_free_path(path); 1445 nilfs_btree_free_path(path);
1497 1446
1498 return ret; 1447 return ret;
@@ -1879,7 +1828,7 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
1879 struct nilfs_btree_path *path, 1828 struct nilfs_btree_path *path,
1880 int level, struct buffer_head *bh) 1829 int level, struct buffer_head *bh)
1881{ 1830{
1882 int maxlevel, ret; 1831 int maxlevel = 0, ret;
1883 struct nilfs_btree_node *parent; 1832 struct nilfs_btree_node *parent;
1884 struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap); 1833 struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap);
1885 __u64 ptr; 1834 __u64 ptr;
@@ -1923,7 +1872,6 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
1923 path = nilfs_btree_alloc_path(); 1872 path = nilfs_btree_alloc_path();
1924 if (path == NULL) 1873 if (path == NULL)
1925 return -ENOMEM; 1874 return -ENOMEM;
1926 nilfs_btree_init_path(path);
1927 1875
1928 if (buffer_nilfs_node(bh)) { 1876 if (buffer_nilfs_node(bh)) {
1929 node = (struct nilfs_btree_node *)bh->b_data; 1877 node = (struct nilfs_btree_node *)bh->b_data;
@@ -1947,7 +1895,6 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
1947 nilfs_btree_propagate_p(btree, path, level, bh); 1895 nilfs_btree_propagate_p(btree, path, level, bh);
1948 1896
1949 out: 1897 out:
1950 nilfs_btree_release_path(path);
1951 nilfs_btree_free_path(path); 1898 nilfs_btree_free_path(path);
1952 1899
1953 return ret; 1900 return ret;
@@ -2108,7 +2055,6 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
2108 path = nilfs_btree_alloc_path(); 2055 path = nilfs_btree_alloc_path();
2109 if (path == NULL) 2056 if (path == NULL)
2110 return -ENOMEM; 2057 return -ENOMEM;
2111 nilfs_btree_init_path(path);
2112 2058
2113 if (buffer_nilfs_node(*bh)) { 2059 if (buffer_nilfs_node(*bh)) {
2114 node = (struct nilfs_btree_node *)(*bh)->b_data; 2060 node = (struct nilfs_btree_node *)(*bh)->b_data;
@@ -2130,7 +2076,6 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
2130 nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo); 2076 nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
2131 2077
2132 out: 2078 out:
2133 nilfs_btree_release_path(path);
2134 nilfs_btree_free_path(path); 2079 nilfs_btree_free_path(path);
2135 2080
2136 return ret; 2081 return ret;
@@ -2175,7 +2120,6 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
2175 path = nilfs_btree_alloc_path(); 2120 path = nilfs_btree_alloc_path();
2176 if (path == NULL) 2121 if (path == NULL)
2177 return -ENOMEM; 2122 return -ENOMEM;
2178 nilfs_btree_init_path(path);
2179 2123
2180 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1); 2124 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1);
2181 if (ret < 0) { 2125 if (ret < 0) {
@@ -2195,7 +2139,6 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
2195 nilfs_bmap_set_dirty(&btree->bt_bmap); 2139 nilfs_bmap_set_dirty(&btree->bt_bmap);
2196 2140
2197 out: 2141 out:
2198 nilfs_btree_release_path(path);
2199 nilfs_btree_free_path(path); 2142 nilfs_btree_free_path(path);
2200 return ret; 2143 return ret;
2201} 2144}
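
The refactoring folds nilfs_btree_init_path() into the allocator and nilfs_btree_release_path() into the destructor, so a path can no longer be allocated but left uninitialized, and every call site above loses two lines of boilerplate. Call sites now read (condensed from nilfs_btree_lookup() above):

    path = nilfs_btree_alloc_path();    /* allocates and initializes   */
    if (path == NULL)
            return -ENOMEM;
    ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
    nilfs_btree_free_path(path);        /* brelse()s every level, then */
                                        /* returns path to the slab    */
    return ret;

The unrelated-looking "int maxlevel = 0" change in nilfs_btree_propagate_v() presumably quiets a may-be-used-uninitialized compiler warning on an error path; the diff does not say so explicitly.
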
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 4b82d84ade75..af638d59e3bf 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -30,9 +30,6 @@
30#include "btnode.h" 30#include "btnode.h"
31#include "bmap.h" 31#include "bmap.h"
32 32
33struct nilfs_btree;
34struct nilfs_btree_path;
35
36/** 33/**
37 * struct nilfs_btree - B-tree structure 34 * struct nilfs_btree - B-tree structure
38 * @bt_bmap: bmap base structure 35 * @bt_bmap: bmap base structure
@@ -41,6 +38,25 @@ struct nilfs_btree {
 	struct nilfs_bmap bt_bmap;
 };
 
+/**
+ * struct nilfs_btree_path - A path on which B-tree operations are executed
+ * @bp_bh: buffer head of node block
+ * @bp_sib_bh: buffer head of sibling node block
+ * @bp_index: index of child node
+ * @bp_oldreq: ptr end request for old ptr
+ * @bp_newreq: ptr alloc request for new ptr
+ * @bp_op: rebalance operation
+ */
+struct nilfs_btree_path {
+	struct buffer_head *bp_bh;
+	struct buffer_head *bp_sib_bh;
+	int bp_index;
+	union nilfs_bmap_ptr_req bp_oldreq;
+	union nilfs_bmap_ptr_req bp_newreq;
+	struct nilfs_btnode_chkey_ctxt bp_ctxt;
+	void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *,
+		      int, __u64 *, __u64 *);
+};
 
 #define NILFS_BTREE_ROOT_SIZE NILFS_BMAP_SIZE
 #define NILFS_BTREE_ROOT_NCHILDREN_MAX \
@@ -57,6 +73,7 @@ struct nilfs_btree {
 #define NILFS_BTREE_KEY_MIN ((__u64)0)
 #define NILFS_BTREE_KEY_MAX (~(__u64)0)
 
+extern struct kmem_cache *nilfs_btree_path_cache;
 
 int nilfs_btree_path_cache_init(void);
 void nilfs_btree_path_cache_destroy(void);
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 9d1e5de91afb..013146755683 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -288,7 +288,7 @@ int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr)
  * @vblocknrs and @nitems.
  *
  * Return Value: On success, 0 is returned. On error, one of the following
- * nagative error codes is returned.
+ * negative error codes is returned.
  *
  * %-EIO - I/O error.
  *
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 0092840492ee..85c89dfc71f0 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -396,7 +396,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
 		/* next page is past the blocks we've got */
 		if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) {
 			nilfs_error(dir->i_sb, __func__,
-				    "dir %lu size %lld exceeds block cout %llu",
+				    "dir %lu size %lld exceeds block count %llu",
 				    dir->i_ino, dir->i_size,
 				    (unsigned long long)dir->i_blocks);
 			goto out;
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index e16a6664dfa2..145f03cd7d3e 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -28,10 +28,10 @@
  * gcinodes), and this file provides lookup function of the dummy
  * inodes and their buffer read function.
  *
- * Since NILFS2 keeps up multiple checkpoints/snapshots accross GC, it
+ * Since NILFS2 keeps up multiple checkpoints/snapshots across GC, it
  * has to treat blocks that belong to a same file but have different
  * checkpoint numbers. To avoid interference among generations, dummy
- * inodes are managed separatly from actual inodes, and their lookup
+ * inodes are managed separately from actual inodes, and their lookup
  * function (nilfs_gc_iget) is designed to be specified with a
  * checkpoint number argument as well as an inode number.
  *
@@ -45,6 +45,7 @@
 #include <linux/buffer_head.h>
 #include <linux/mpage.h>
 #include <linux/hash.h>
+#include <linux/slab.h>
 #include <linux/swap.h>
 #include "nilfs.h"
 #include "page.h"
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 7868cc122ac7..5e226d4b41d3 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -22,6 +22,7 @@
  */
 
 #include <linux/buffer_head.h>
+#include <linux/gfp.h>
 #include <linux/mpage.h>
 #include <linux/writeback.h>
 #include <linux/uio.h>
@@ -450,7 +451,7 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
 		inode->i_op = &nilfs_special_inode_operations;
 		init_special_inode(
 			inode, inode->i_mode,
-			new_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
+			huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
 	}
 	nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
 	brelse(bh);
@@ -510,7 +511,7 @@ void nilfs_write_inode_common(struct inode *inode,
 		nilfs_bmap_write(ii->i_bmap, raw_inode);
 	else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
 		raw_inode->i_device_code =
-			cpu_to_le64(new_encode_dev(inode->i_rdev));
+			cpu_to_le64(huge_encode_dev(inode->i_rdev));
 	/* When extending inode, nilfs->ns_inode_size should be checked
 	   for substitutions of appended fields */
 }
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 313d0a21da48..f90a33d9a5b0 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -23,6 +23,7 @@
 #include <linux/fs.h>
 #include <linux/wait.h>
 #include <linux/smp_lock.h> /* lock_kernel(), unlock_kernel() */
+#include <linux/slab.h>
 #include <linux/capability.h> /* capable() */
 #include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */
 #include <linux/vmalloc.h>
@@ -648,7 +649,7 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
 long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct inode *inode = filp->f_dentry->d_inode;
-	void __user *argp = (void * __user *)arg;
+	void __user *argp = (void __user *)arg;
 
 	switch (cmd) {
 	case NILFS_IOCTL_CHANGE_CPMODE:
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 06713ffcc7f2..024be8c35bb6 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -26,6 +26,7 @@
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/swap.h>
+#include <linux/slab.h>
 #include "nilfs.h"
 #include "segment.h"
 #include "page.h"
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index a2692bbc7b50..8de3e1e48130 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -29,6 +29,7 @@
 #include <linux/list.h>
 #include <linux/highmem.h>
 #include <linux/pagevec.h>
+#include <linux/gfp.h>
 #include "nilfs.h"
 #include "page.h"
 #include "mdt.h"
@@ -292,7 +293,7 @@ void nilfs_free_private_page(struct page *page)
  * @src: source page
  * @copy_dirty: flag whether to copy dirty states on the page's buffer heads.
  *
- * This fuction is for both data pages and btnode pages. The dirty flag
+ * This function is for both data pages and btnode pages. The dirty flag
  * should be treated by caller. The page must not be under i/o.
  * Both src and dst page must be locked
  */
@@ -388,7 +389,7 @@ repeat:
 }
 
 /**
- * nilfs_copy_back_pages -- copy back pages to orignal cache from shadow cache
+ * nilfs_copy_back_pages -- copy back pages to original cache from shadow cache
  * @dmap: destination page cache
  * @smap: source page cache
  *
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 017bedc761a0..bae2a516b4ee 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -23,6 +23,7 @@
 #include <linux/buffer_head.h>
 #include <linux/blkdev.h>
 #include <linux/swap.h>
+#include <linux/slab.h>
 #include <linux/crc32.h>
 #include "nilfs.h"
 #include "segment.h"
@@ -104,6 +105,8 @@ static void store_segsum_info(struct nilfs_segsum_info *ssi,
 
 	ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize);
 	ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi);
+
+	/* need to verify ->ss_bytes field if read ->ss_cno */
 }
 
 /**
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index ab56fe44e377..2e6a2723b8fa 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -25,6 +25,7 @@
 #include <linux/writeback.h>
 #include <linux/crc32.h>
 #include <linux/backing-dev.h>
+#include <linux/slab.h>
 #include "page.h"
 #include "segbuf.h"
 
@@ -32,42 +33,17 @@
 struct nilfs_write_info {
 	struct the_nilfs *nilfs;
 	struct bio *bio;
 	int start, end; /* The region to be submitted */
 	int rest_blocks;
 	int max_pages;
 	int nr_vecs;
 	sector_t blocknr;
 };
 
-
 static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
 			      struct the_nilfs *nilfs);
 static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf);
 
-
-static struct kmem_cache *nilfs_segbuf_cachep;
-
-static void nilfs_segbuf_init_once(void *obj)
-{
-	memset(obj, 0, sizeof(struct nilfs_segment_buffer));
-}
-
-int __init nilfs_init_segbuf_cache(void)
-{
-	nilfs_segbuf_cachep =
-		kmem_cache_create("nilfs2_segbuf_cache",
-				  sizeof(struct nilfs_segment_buffer),
-				  0, SLAB_RECLAIM_ACCOUNT,
-				  nilfs_segbuf_init_once);
-
-	return (nilfs_segbuf_cachep == NULL) ? -ENOMEM : 0;
-}
-
-void nilfs_destroy_segbuf_cache(void)
-{
-	kmem_cache_destroy(nilfs_segbuf_cachep);
-}
-
 struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
 {
 	struct nilfs_segment_buffer *segbuf;
@@ -80,6 +56,7 @@ struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
 	INIT_LIST_HEAD(&segbuf->sb_list);
 	INIT_LIST_HEAD(&segbuf->sb_segsum_buffers);
 	INIT_LIST_HEAD(&segbuf->sb_payload_buffers);
+	segbuf->sb_super_root = NULL;
 
 	init_completion(&segbuf->sb_bio_event);
 	atomic_set(&segbuf->sb_err, 0);
@@ -157,7 +134,7 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf,
 }
 
 int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
-		       time_t ctime)
+		       time_t ctime, __u64 cno)
 {
 	int err;
 
@@ -170,11 +147,12 @@ int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
 	segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary);
 	segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0;
 	segbuf->sb_sum.ctime = ctime;
+	segbuf->sb_sum.cno = cno;
 	return 0;
 }
 
 /*
- * Setup segument summary
+ * Setup segment summary
  */
 void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf)
 {
@@ -195,13 +173,14 @@ void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf)
 	raw_sum->ss_nfinfo = cpu_to_le32(segbuf->sb_sum.nfinfo);
 	raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes);
 	raw_sum->ss_pad = 0;
+	raw_sum->ss_cno = cpu_to_le64(segbuf->sb_sum.cno);
 }
 
 /*
  * CRC calculation routines
  */
-void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf,
-				     u32 seed)
+static void
+nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf, u32 seed)
 {
 	struct buffer_head *bh;
 	struct nilfs_segment_summary *raw_sum;
@@ -228,8 +207,8 @@ void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf,
 	raw_sum->ss_sumsum = cpu_to_le32(crc);
 }
 
-void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
-				   u32 seed)
+static void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
+					  u32 seed)
 {
 	struct buffer_head *bh;
 	struct nilfs_segment_summary *raw_sum;
@@ -255,6 +234,20 @@ void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
 	raw_sum->ss_datasum = cpu_to_le32(crc);
 }
 
+static void
+nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf,
+				    u32 seed)
+{
+	struct nilfs_super_root *raw_sr;
+	u32 crc;
+
+	raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data;
+	crc = crc32_le(seed,
+		       (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
+		       NILFS_SR_BYTES - sizeof(raw_sr->sr_sum));
+	raw_sr->sr_sum = cpu_to_le32(crc);
+}
+
 static void nilfs_release_buffers(struct list_head *list)
 {
 	struct buffer_head *bh, *n;
@@ -281,6 +274,7 @@ static void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf)
 {
 	nilfs_release_buffers(&segbuf->sb_segsum_buffers);
 	nilfs_release_buffers(&segbuf->sb_payload_buffers);
+	segbuf->sb_super_root = NULL;
 }
 
 /*
@@ -323,14 +317,31 @@ int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs)
 int nilfs_wait_on_logs(struct list_head *logs)
 {
 	struct nilfs_segment_buffer *segbuf;
-	int err;
+	int err, ret = 0;
 
 	list_for_each_entry(segbuf, logs, sb_list) {
 		err = nilfs_segbuf_wait(segbuf);
-		if (err)
-			return err;
+		if (err && !ret)
+			ret = err;
+	}
+	return ret;
+}
+
+/**
+ * nilfs_add_checksums_on_logs - add checksums on the logs
+ * @logs: list of segment buffers storing target logs
+ * @seed: checksum seed value
+ */
+void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed)
+{
+	struct nilfs_segment_buffer *segbuf;
+
+	list_for_each_entry(segbuf, logs, sb_list) {
+		if (segbuf->sb_super_root)
+			nilfs_segbuf_fill_in_super_root_crc(segbuf, seed);
+		nilfs_segbuf_fill_in_segsum_crc(segbuf, seed);
+		nilfs_segbuf_fill_in_data_crc(segbuf, seed);
 	}
-	return 0;
 }
 
 /*
@@ -470,8 +481,8 @@ static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf,
  *
  * %-ENOMEM - Insufficient memory available.
  */
-int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
-		       struct the_nilfs *nilfs)
+static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
+			      struct the_nilfs *nilfs)
 {
 	struct nilfs_write_info wi;
 	struct buffer_head *bh;
@@ -514,7 +525,7 @@ int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
  *
  * %-EIO - I/O error
  */
-int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf)
+static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf)
 {
 	int err = 0;
 
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index 94dfd3517bc0..fdf1c3b6d673 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -37,6 +37,7 @@
  * @sumbytes: Byte count of segment summary
  * @nfileblk: Total number of file blocks
  * @seg_seq: Segment sequence number
+ * @cno: Checkpoint number
  * @ctime: Creation time
  * @next: Block number of the next full segment
  */
@@ -48,6 +49,7 @@ struct nilfs_segsum_info {
 	unsigned long sumbytes;
 	unsigned long nfileblk;
 	u64 seg_seq;
+	__u64 cno;
 	time_t ctime;
 	sector_t next;
 };
@@ -76,6 +78,7 @@ struct nilfs_segsum_info {
  * @sb_rest_blocks: Number of residual blocks in the current segment
  * @sb_segsum_buffers: List of buffers for segment summaries
  * @sb_payload_buffers: List of buffers for segment payload
+ * @sb_super_root: Pointer to buffer storing a super root block (if exists)
  * @sb_nbio: Number of flying bio requests
  * @sb_err: I/O error status
  * @sb_bio_event: Completion event of log writing
@@ -95,6 +98,7 @@ struct nilfs_segment_buffer {
 	/* Buffers */
 	struct list_head sb_segsum_buffers;
 	struct list_head sb_payload_buffers; /* including super root */
+	struct buffer_head *sb_super_root;
 
 	/* io status */
 	int sb_nbio;
@@ -121,6 +125,7 @@ struct nilfs_segment_buffer {
 			    b_assoc_buffers))
 #define NILFS_SEGBUF_BH_IS_LAST(bh, head) ((bh)->b_assoc_buffers.next == head)
 
+extern struct kmem_cache *nilfs_segbuf_cachep;
 
 int __init nilfs_init_segbuf_cache(void);
 void nilfs_destroy_segbuf_cache(void);
@@ -132,13 +137,11 @@ void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf,
 		       struct nilfs_segment_buffer *prev);
 void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64,
 				  struct the_nilfs *);
-int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t);
+int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t, __u64);
 int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *);
 int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *,
 				struct buffer_head **);
 void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *);
-void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *, u32);
-void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *, u32);
 
 static inline void
 nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf,
@@ -171,6 +174,7 @@ void nilfs_truncate_logs(struct list_head *logs,
 			struct nilfs_segment_buffer *last);
 int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs);
 int nilfs_wait_on_logs(struct list_head *logs);
+void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed);
 
 static inline void nilfs_destroy_logs(struct list_head *logs)
 {
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index ada2f1b947a3..c9201649cc49 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -32,6 +32,7 @@
 #include <linux/kthread.h>
 #include <linux/crc32.h>
 #include <linux/pagevec.h>
+#include <linux/slab.h>
 #include "nilfs.h"
 #include "btnode.h"
 #include "page.h"
@@ -115,42 +116,6 @@ static void nilfs_dispose_list(struct nilfs_sb_info *, struct list_head *,
 #define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a)
 #define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a)
 
-/*
- * Transaction
- */
-static struct kmem_cache *nilfs_transaction_cachep;
-
-/**
- * nilfs_init_transaction_cache - create a cache for nilfs_transaction_info
- *
- * nilfs_init_transaction_cache() creates a slab cache for the struct
- * nilfs_transaction_info.
- *
- * Return Value: On success, it returns 0. On error, one of the following
- * negative error code is returned.
- *
- * %-ENOMEM - Insufficient memory available.
- */
-int nilfs_init_transaction_cache(void)
-{
-	nilfs_transaction_cachep =
-		kmem_cache_create("nilfs2_transaction_cache",
-				  sizeof(struct nilfs_transaction_info),
-				  0, SLAB_RECLAIM_ACCOUNT, NULL);
-	return (nilfs_transaction_cachep == NULL) ? -ENOMEM : 0;
-}
-
-/**
- * nilfs_detroy_transaction_cache - destroy the cache for transaction info
- *
- * nilfs_destroy_transaction_cache() frees the slab cache for the struct
- * nilfs_transaction_info.
- */
-void nilfs_destroy_transaction_cache(void)
-{
-	kmem_cache_destroy(nilfs_transaction_cachep);
-}
-
 static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
 {
 	struct nilfs_transaction_info *cur_ti = current->journal_info;
@@ -201,7 +166,7 @@ static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
  * This function allocates a nilfs_transaction_info struct to keep context
  * information on it. It is initialized and hooked onto the current task in
  * the outermost call. If a pre-allocated struct is given to @ti, it is used
- * instead; othewise a new struct is assigned from a slab.
+ * instead; otherwise a new struct is assigned from a slab.
  *
  * When @vacancy_check flag is set, this function will check the amount of
  * free space, and will wait for the GC to reclaim disk space if low capacity.
@@ -401,7 +366,8 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
 
 	if (nilfs_doing_gc())
 		flags = NILFS_SS_GC;
-	err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime);
+	err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime,
+				 sci->sc_sbi->s_nilfs->ns_cno);
 	if (unlikely(err))
 		return err;
 
@@ -434,7 +400,7 @@ static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci)
 			return err;
 		segbuf = sci->sc_curseg;
 	}
-	err = nilfs_segbuf_extend_payload(segbuf, &sci->sc_super_root);
+	err = nilfs_segbuf_extend_payload(segbuf, &segbuf->sb_super_root);
 	if (likely(!err))
 		segbuf->sb_sum.flags |= NILFS_SS_SR;
 	return err;
@@ -598,7 +564,7 @@ static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci,
 	*vblocknr = binfo->bi_v.bi_vblocknr;
 }
 
-struct nilfs_sc_operations nilfs_sc_file_ops = {
+static struct nilfs_sc_operations nilfs_sc_file_ops = {
 	.collect_data = nilfs_collect_file_data,
 	.collect_node = nilfs_collect_file_node,
 	.collect_bmap = nilfs_collect_file_bmap,
@@ -648,7 +614,7 @@ static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci,
 	*binfo_dat = binfo->bi_dat;
 }
 
-struct nilfs_sc_operations nilfs_sc_dat_ops = {
+static struct nilfs_sc_operations nilfs_sc_dat_ops = {
 	.collect_data = nilfs_collect_dat_data,
 	.collect_node = nilfs_collect_file_node,
 	.collect_bmap = nilfs_collect_dat_bmap,
@@ -656,7 +622,7 @@ struct nilfs_sc_operations nilfs_sc_dat_ops = {
 	.write_node_binfo = nilfs_write_dat_node_binfo,
 };
 
-struct nilfs_sc_operations nilfs_sc_dsync_ops = {
+static struct nilfs_sc_operations nilfs_sc_dsync_ops = {
 	.collect_data = nilfs_collect_file_data,
 	.collect_node = NULL,
 	.collect_bmap = NULL,
@@ -931,43 +897,16 @@ static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci,
 	}
 }
 
-/*
- * CRC calculation routines
- */
-static void nilfs_fill_in_super_root_crc(struct buffer_head *bh_sr, u32 seed)
-{
-	struct nilfs_super_root *raw_sr =
-		(struct nilfs_super_root *)bh_sr->b_data;
-	u32 crc;
-
-	crc = crc32_le(seed,
-		       (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
-		       NILFS_SR_BYTES - sizeof(raw_sr->sr_sum));
-	raw_sr->sr_sum = cpu_to_le32(crc);
-}
-
-static void nilfs_segctor_fill_in_checksums(struct nilfs_sc_info *sci,
-					    u32 seed)
-{
-	struct nilfs_segment_buffer *segbuf;
-
-	if (sci->sc_super_root)
-		nilfs_fill_in_super_root_crc(sci->sc_super_root, seed);
-
-	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
-		nilfs_segbuf_fill_in_segsum_crc(segbuf, seed);
-		nilfs_segbuf_fill_in_data_crc(segbuf, seed);
-	}
-}
-
 static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
 					     struct the_nilfs *nilfs)
 {
-	struct buffer_head *bh_sr = sci->sc_super_root;
-	struct nilfs_super_root *raw_sr =
-		(struct nilfs_super_root *)bh_sr->b_data;
+	struct buffer_head *bh_sr;
+	struct nilfs_super_root *raw_sr;
 	unsigned isz = nilfs->ns_inode_size;
 
+	bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root;
+	raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
+
 	raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES);
 	raw_sr->sr_nongc_ctime
 		= cpu_to_le64(nilfs_doing_gc() ?
@@ -1490,7 +1429,6 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
 
 	/* Collection retry loop */
 	for (;;) {
-		sci->sc_super_root = NULL;
 		sci->sc_nblk_this_inc = 0;
 		sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
 
@@ -1510,6 +1448,12 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
 		if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
 			break;
 
+		nilfs_clear_logs(&sci->sc_segbufs);
+
+		err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
+		if (unlikely(err))
+			return err;
+
 		if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
 			err = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
 							sci->sc_freesegs,
@@ -1517,12 +1461,6 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
 							NULL);
 			WARN_ON(err); /* do not happen */
 		}
-		nilfs_clear_logs(&sci->sc_segbufs);
-
-		err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
-		if (unlikely(err))
-			return err;
-
 		nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
 		sci->sc_stage = prev_stage;
 	}
@@ -1567,7 +1505,7 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
 	ssp.offset = sizeof(struct nilfs_segment_summary);
 
 	list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
-		if (bh == sci->sc_super_root)
+		if (bh == segbuf->sb_super_root)
 			break;
 		if (!finfo) {
 			finfo = nilfs_segctor_map_segsum_entry(
@@ -1728,7 +1666,7 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
 
 		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
 				    b_assoc_buffers) {
-			if (bh == sci->sc_super_root) {
+			if (bh == segbuf->sb_super_root) {
 				if (bh->b_page != bd_page) {
 					lock_page(bd_page);
 					clear_page_dirty_for_io(bd_page);
@@ -1847,7 +1785,7 @@ static void nilfs_clear_copied_buffers(struct list_head *list, int err)
 }
 
 static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
-			     struct buffer_head *bh_sr, int err)
+			     int err)
 {
 	struct nilfs_segment_buffer *segbuf;
 	struct page *bd_page = NULL, *fs_page = NULL;
@@ -1868,7 +1806,7 @@ static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
 
 		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
 				    b_assoc_buffers) {
-			if (bh == bh_sr) {
+			if (bh == segbuf->sb_super_root) {
 				if (bh->b_page != bd_page) {
 					end_page_writeback(bd_page);
 					bd_page = bh->b_page;
@@ -1897,8 +1835,7 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
 
 	list_splice_tail_init(&sci->sc_write_logs, &logs);
 	ret = nilfs_wait_on_logs(&logs);
-	if (ret)
-		nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret);
+	nilfs_abort_logs(&logs, NULL, ret ? : err);
 
 	list_splice_tail_init(&sci->sc_segbufs, &logs);
 	nilfs_cancel_segusage(&logs, nilfs->ns_sufile);
@@ -1914,7 +1851,6 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
 	}
 
 	nilfs_destroy_logs(&logs);
-	sci->sc_super_root = NULL;
 }
 
 static void nilfs_set_next_segment(struct the_nilfs *nilfs,
@@ -1933,7 +1869,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 	struct nilfs_segment_buffer *segbuf;
 	struct page *bd_page = NULL, *fs_page = NULL;
 	struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
-	int update_sr = (sci->sc_super_root != NULL);
+	int update_sr = false;
 
 	list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) {
 		struct buffer_head *bh;
@@ -1964,11 +1900,12 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 			set_buffer_uptodate(bh);
 			clear_buffer_dirty(bh);
 			clear_buffer_nilfs_volatile(bh);
-			if (bh == sci->sc_super_root) {
+			if (bh == segbuf->sb_super_root) {
 				if (bh->b_page != bd_page) {
 					end_page_writeback(bd_page);
 					bd_page = bh->b_page;
 				}
+				update_sr = true;
 				break;
 			}
 			if (bh->b_page != fs_page) {
@@ -2115,7 +2052,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 	struct nilfs_sb_info *sbi = sci->sc_sbi;
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	struct page *failed_page;
-	int err, has_sr = 0;
+	int err;
 
 	sci->sc_stage.scnt = NILFS_ST_INIT;
 
@@ -2143,8 +2080,6 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 		if (unlikely(err))
 			goto failed;
 
-		has_sr = (sci->sc_super_root != NULL);
-
 		/* Avoid empty segment */
 		if (sci->sc_stage.scnt == NILFS_ST_DONE &&
 		    NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) {
@@ -2159,7 +2094,8 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 		if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
 			nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile);
 
-		if (has_sr) {
+		if (mode == SC_LSEG_SR &&
+		    sci->sc_stage.scnt >= NILFS_ST_CPFILE) {
 			err = nilfs_segctor_fill_in_checkpoint(sci);
 			if (unlikely(err))
 				goto failed_to_write;
@@ -2171,11 +2107,12 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 		/* Write partial segments */
 		err = nilfs_segctor_prepare_write(sci, &failed_page);
 		if (err) {
-			nilfs_abort_logs(&sci->sc_segbufs, failed_page,
-					 sci->sc_super_root, err);
+			nilfs_abort_logs(&sci->sc_segbufs, failed_page, err);
 			goto failed_to_write;
 		}
-		nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed);
+
+		nilfs_add_checksums_on_logs(&sci->sc_segbufs,
+					    nilfs->ns_crc_seed);
 
 		err = nilfs_segctor_write(sci, nilfs);
 		if (unlikely(err))
@@ -2196,8 +2133,6 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 		}
 	} while (sci->sc_stage.scnt != NILFS_ST_DONE);
 
-	sci->sc_super_root = NULL;
-
  out:
 	nilfs_segctor_check_out_files(sci, sbi);
 	return err;
@@ -2214,7 +2149,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 }
 
 /**
- * nilfs_secgtor_start_timer - set timer of background write
+ * nilfs_segctor_start_timer - set timer of background write
  * @sci: nilfs_sc_info
  *
  * If the timer has already been set, it ignores the new request.
@@ -2224,9 +2159,9 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci)
 {
 	spin_lock(&sci->sc_state_lock);
-	if (sci->sc_timer && !(sci->sc_state & NILFS_SEGCTOR_COMMIT)) {
-		sci->sc_timer->expires = jiffies + sci->sc_interval;
-		add_timer(sci->sc_timer);
+	if (!(sci->sc_state & NILFS_SEGCTOR_COMMIT)) {
+		sci->sc_timer.expires = jiffies + sci->sc_interval;
+		add_timer(&sci->sc_timer);
 		sci->sc_state |= NILFS_SEGCTOR_COMMIT;
 	}
 	spin_unlock(&sci->sc_state_lock);
@@ -2431,9 +2366,7 @@ static void nilfs_segctor_accept(struct nilfs_sc_info *sci)
 	spin_lock(&sci->sc_state_lock);
 	sci->sc_seq_accepted = sci->sc_seq_request;
 	spin_unlock(&sci->sc_state_lock);
-
-	if (sci->sc_timer)
-		del_timer_sync(sci->sc_timer);
+	del_timer_sync(&sci->sc_timer);
 }
 
 /**
@@ -2459,9 +2392,9 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err)
 			sci->sc_flush_request &= ~FLUSH_DAT_BIT;
 
 		/* re-enable timer if checkpoint creation was not done */
-		if (sci->sc_timer && (sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
-		    time_before(jiffies, sci->sc_timer->expires))
-			add_timer(sci->sc_timer);
+		if ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
+		    time_before(jiffies, sci->sc_timer.expires))
+			add_timer(&sci->sc_timer);
 	}
 	spin_unlock(&sci->sc_state_lock);
 }
@@ -2640,13 +2573,10 @@ static int nilfs_segctor_thread(void *arg)
 {
 	struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
 	struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
-	struct timer_list timer;
 	int timeout = 0;
 
-	init_timer(&timer);
-	timer.data = (unsigned long)current;
-	timer.function = nilfs_construction_timeout;
-	sci->sc_timer = &timer;
+	sci->sc_timer.data = (unsigned long)current;
+	sci->sc_timer.function = nilfs_construction_timeout;
 
 	/* start sync. */
 	sci->sc_task = current;
@@ -2695,7 +2625,7 @@ static int nilfs_segctor_thread(void *arg)
 			should_sleep = 0;
 		else if (sci->sc_state & NILFS_SEGCTOR_COMMIT)
 			should_sleep = time_before(jiffies,
-					sci->sc_timer->expires);
+					sci->sc_timer.expires);
 
 		if (should_sleep) {
 			spin_unlock(&sci->sc_state_lock);
@@ -2704,7 +2634,7 @@ static int nilfs_segctor_thread(void *arg)
 		}
 		finish_wait(&sci->sc_wait_daemon, &wait);
 		timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
-			   time_after_eq(jiffies, sci->sc_timer->expires));
+			   time_after_eq(jiffies, sci->sc_timer.expires));
 
 		if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs))
 			set_nilfs_discontinued(nilfs);
@@ -2713,8 +2643,6 @@ static int nilfs_segctor_thread(void *arg)
 
  end_thread:
 	spin_unlock(&sci->sc_state_lock);
-	del_timer_sync(sci->sc_timer);
-	sci->sc_timer = NULL;
 
 	/* end sync. */
 	sci->sc_task = NULL;
@@ -2750,13 +2678,6 @@ static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
 	}
 }
 
-static int nilfs_segctor_init(struct nilfs_sc_info *sci)
-{
-	sci->sc_seq_done = sci->sc_seq_request;
-
-	return nilfs_segctor_start_thread(sci);
-}
-
 /*
  * Setup & clean-up functions
  */
@@ -2780,6 +2701,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
 	INIT_LIST_HEAD(&sci->sc_write_logs);
 	INIT_LIST_HEAD(&sci->sc_gc_inodes);
 	INIT_LIST_HEAD(&sci->sc_copied_buffers);
+	init_timer(&sci->sc_timer);
 
 	sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
 	sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ;
@@ -2846,6 +2768,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
 
 	down_write(&sbi->s_nilfs->ns_segctor_sem);
 
+	del_timer_sync(&sci->sc_timer);
 	kfree(sci);
 }
 
@@ -2854,7 +2777,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
  * @sbi: nilfs_sb_info
  *
  * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info,
- * initilizes it, and starts the segment constructor.
+ * initializes it, and starts the segment constructor.
  *
  * Return Value: On success, 0 is returned. On error, one of the following
  * negative error code is returned.
@@ -2880,7 +2803,7 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
 		return -ENOMEM;
 
 	nilfs_attach_writer(nilfs, sbi);
-	err = nilfs_segctor_init(NILFS_SC(sbi));
+	err = nilfs_segctor_start_thread(NILFS_SC(sbi));
 	if (err) {
 		nilfs_detach_writer(nilfs, sbi);
 		kfree(sbi->s_sc_info);
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 3155e0c7f415..dca142361ccf 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -30,7 +30,7 @@
30#include "sb.h" 30#include "sb.h"
31 31
32/** 32/**
33 * struct nilfs_recovery_info - Recovery infomation 33 * struct nilfs_recovery_info - Recovery information
34 * @ri_need_recovery: Recovery status 34 * @ri_need_recovery: Recovery status
35 * @ri_super_root: Block number of the last super root 35 * @ri_super_root: Block number of the last super root
36 * @ri_ri_cno: Number of the last checkpoint 36 * @ri_ri_cno: Number of the last checkpoint
@@ -71,7 +71,7 @@ struct nilfs_recovery_info {
  */
 struct nilfs_cstage {
 	int scnt;
 	unsigned flags;
 	struct nilfs_inode_info *dirty_file_ptr;
 	struct nilfs_inode_info *gc_inode_ptr;
 };
@@ -100,7 +100,6 @@ struct nilfs_segsum_pointer {
  * @sc_write_logs: List of segment buffers to hold logs under writing
  * @sc_segbuf_nblocks: Number of available blocks in segment buffers.
  * @sc_curseg: Current segment buffer
- * @sc_super_root: Pointer to the super root buffer
  * @sc_stage: Collection stage
  * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary
  * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary
@@ -148,7 +147,6 @@ struct nilfs_sc_info {
 	struct list_head sc_write_logs;
 	unsigned long sc_segbuf_nblocks;
 	struct nilfs_segment_buffer *sc_curseg;
-	struct buffer_head *sc_super_root;
 
 	struct nilfs_cstage sc_stage;
 
@@ -179,7 +177,7 @@ struct nilfs_sc_info {
 	unsigned long sc_lseg_stime; /* in 1/HZ seconds */
 	unsigned long sc_watermark;
 
-	struct timer_list *sc_timer;
+	struct timer_list sc_timer;
 	struct task_struct *sc_task;
 };
 
@@ -219,6 +217,8 @@ enum {
  */
 #define NILFS_SC_DEFAULT_WATERMARK 3600
 
+/* super.c */
+extern struct kmem_cache *nilfs_transaction_cachep;
 
 /* segment.c */
 extern int nilfs_init_transaction_cache(void);
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index b6c36d0cc331..3c6cc6005c2e 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -18,7 +18,7 @@
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  *
  * Written by Koji Sato <koji@osrg.net>.
- * Rivised by Ryusuke Konishi <ryusuke@osrg.net>.
+ * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
  */
 
 #include <linux/kernel.h>
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 92579cc4c935..03b34b738993 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -67,6 +67,11 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
67 "(NILFS)"); 67 "(NILFS)");
68MODULE_LICENSE("GPL"); 68MODULE_LICENSE("GPL");
69 69
70struct kmem_cache *nilfs_inode_cachep;
71struct kmem_cache *nilfs_transaction_cachep;
72struct kmem_cache *nilfs_segbuf_cachep;
73struct kmem_cache *nilfs_btree_path_cache;
74
70static int nilfs_remount(struct super_block *sb, int *flags, char *data); 75static int nilfs_remount(struct super_block *sb, int *flags, char *data);
71 76
72/** 77/**
@@ -129,7 +134,6 @@ void nilfs_warning(struct super_block *sb, const char *function,
 	va_end(args);
 }
 
-static struct kmem_cache *nilfs_inode_cachep;
 
 struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs)
 {
@@ -155,34 +159,6 @@ void nilfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
 }
 
-static void init_once(void *obj)
-{
-	struct nilfs_inode_info *ii = obj;
-
-	INIT_LIST_HEAD(&ii->i_dirty);
-#ifdef CONFIG_NILFS_XATTR
-	init_rwsem(&ii->xattr_sem);
-#endif
-	nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
-	ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union;
-	inode_init_once(&ii->vfs_inode);
-}
-
-static int nilfs_init_inode_cache(void)
-{
-	nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
-					       sizeof(struct nilfs_inode_info),
-					       0, SLAB_RECLAIM_ACCOUNT,
-					       init_once);
-
-	return (nilfs_inode_cachep == NULL) ? -ENOMEM : 0;
-}
-
-static inline void nilfs_destroy_inode_cache(void)
-{
-	kmem_cache_destroy(nilfs_inode_cachep);
-}
-
 static void nilfs_clear_inode(struct inode *inode)
 {
 	struct nilfs_inode_info *ii = NILFS_I(inode);
@@ -266,8 +242,8 @@ int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb)
 	int err;
 
 	/* nilfs->sem must be locked by the caller. */
-	if (sbp[0]->s_magic != NILFS_SUPER_MAGIC) {
-		if (sbp[1] && sbp[1]->s_magic == NILFS_SUPER_MAGIC)
+	if (sbp[0]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) {
+		if (sbp[1] && sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC))
 			nilfs_swap_super_block(nilfs);
 		else {
 			printk(KERN_CRIT "NILFS: superblock broke on dev %s\n",
@@ -436,7 +412,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	/*
 	 * Compute the overhead
 	 *
-	 * When distributing meta data blocks outside semgent structure,
+	 * When distributing meta data blocks outside segment structure,
 	 * We must count them as the overhead.
 	 */
 	overhead = 0;
@@ -470,10 +446,10 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (nilfs_test_opt(sbi, SNAPSHOT))
 		seq_printf(seq, ",cp=%llu",
 			   (unsigned long long int)sbi->s_snapshot_cno);
-	if (nilfs_test_opt(sbi, ERRORS_RO))
-		seq_printf(seq, ",errors=remount-ro");
 	if (nilfs_test_opt(sbi, ERRORS_PANIC))
 		seq_printf(seq, ",errors=panic");
+	if (nilfs_test_opt(sbi, ERRORS_CONT))
+		seq_printf(seq, ",errors=continue");
 	if (nilfs_test_opt(sbi, STRICT_ORDER))
 		seq_printf(seq, ",order=strict");
 	if (nilfs_test_opt(sbi, NORECOVERY))
@@ -631,7 +607,7 @@ nilfs_set_default_options(struct nilfs_sb_info *sbi,
 			   struct nilfs_super_block *sbp)
 {
 	sbi->s_mount_opt =
-		NILFS_MOUNT_ERRORS_CONT | NILFS_MOUNT_BARRIER;
+		NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
 }
 
 static int nilfs_setup_super(struct nilfs_sb_info *sbi)
@@ -749,6 +725,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 	sb->s_export_op = &nilfs_export_ops;
 	sb->s_root = NULL;
 	sb->s_time_gran = 1;
+	sb->s_bdi = nilfs->ns_bdi;
 
 	err = load_nilfs(nilfs, sbi);
 	if (err)
@@ -777,9 +754,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 			goto failed_sbi;
 		}
 		cno = sbi->s_snapshot_cno;
-	} else
-		/* Read-only mount */
-		sbi->s_snapshot_cno = cno;
+	}
 	}
 
 	err = nilfs_attach_checkpoint(sbi, cno);
@@ -848,7 +823,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	unsigned long old_sb_flags;
 	struct nilfs_mount_options old_opts;
-	int err;
+	int was_snapshot, err;
 
 	lock_kernel();
 
@@ -856,6 +831,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 	old_sb_flags = sb->s_flags;
 	old_opts.mount_opt = sbi->s_mount_opt;
 	old_opts.snapshot_cno = sbi->s_snapshot_cno;
+	was_snapshot = nilfs_test_opt(sbi, SNAPSHOT);
 
 	if (!parse_options(data, sb)) {
 		err = -EINVAL;
@@ -863,20 +839,32 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
863 } 839 }
864 sb->s_flags = (sb->s_flags & ~MS_POSIXACL); 840 sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
865 841
866 if ((*flags & MS_RDONLY) && 842 err = -EINVAL;
867 sbi->s_snapshot_cno != old_opts.snapshot_cno) { 843 if (was_snapshot) {
868 printk(KERN_WARNING "NILFS (device %s): couldn't " 844 if (!(*flags & MS_RDONLY)) {
869 "remount to a different snapshot. \n", 845 printk(KERN_ERR "NILFS (device %s): cannot remount "
870 sb->s_id); 846 "snapshot read/write.\n",
871 err = -EINVAL; 847 sb->s_id);
872 goto restore_opts; 848 goto restore_opts;
849 } else if (sbi->s_snapshot_cno != old_opts.snapshot_cno) {
850 printk(KERN_ERR "NILFS (device %s): cannot "
851 "remount to a different snapshot.\n",
852 sb->s_id);
853 goto restore_opts;
854 }
855 } else {
856 if (nilfs_test_opt(sbi, SNAPSHOT)) {
857 printk(KERN_ERR "NILFS (device %s): cannot change "
858 "a regular mount to a snapshot.\n",
859 sb->s_id);
860 goto restore_opts;
861 }
873 } 862 }
874 863
875 if (!nilfs_valid_fs(nilfs)) { 864 if (!nilfs_valid_fs(nilfs)) {
876 printk(KERN_WARNING "NILFS (device %s): couldn't " 865 printk(KERN_WARNING "NILFS (device %s): couldn't "
877 "remount because the filesystem is in an " 866 "remount because the filesystem is in an "
878 "incomplete recovery state.\n", sb->s_id); 867 "incomplete recovery state.\n", sb->s_id);
879 err = -EINVAL;
880 goto restore_opts; 868 goto restore_opts;
881 } 869 }
882 870
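
The rewritten checks above encode a simple policy: a snapshot mount must stay read-only and keep pointing at the same checkpoint, and a regular mount cannot be turned into a snapshot by remount. Stripped of the error reporting, the decision reduces to the following sketch (function and parameter names are illustrative only):

    /* Sketch of the remount policy enforced by the hunk above. */
    static int remount_allowed(int was_snapshot, int want_rdonly,
                               int now_snapshot, __u64 old_cno, __u64 new_cno)
    {
            if (was_snapshot)
                    /* Snapshots stay read-only, pinned to one checkpoint. */
                    return want_rdonly && new_cno == old_cno;
            /* A regular mount must not become a snapshot. */
            return !now_snapshot;
    }
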
@@ -887,9 +875,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
887 nilfs_detach_segment_constructor(sbi); 875 nilfs_detach_segment_constructor(sbi);
888 sb->s_flags |= MS_RDONLY; 876 sb->s_flags |= MS_RDONLY;
889 877
890 sbi->s_snapshot_cno = nilfs_last_cno(nilfs);
891 /* nilfs_set_opt(sbi, SNAPSHOT); */
892
893 /* 878 /*
894 * Remounting a valid RW partition RDONLY, so set 879 * Remounting a valid RW partition RDONLY, so set
895 * the RDONLY flag and then mark the partition as valid again. 880 * the RDONLY flag and then mark the partition as valid again.
@@ -908,24 +893,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
908 * store the current valid flag. (It may have been changed 893 * store the current valid flag. (It may have been changed
909 * by fsck since we originally mounted the partition.) 894 * by fsck since we originally mounted the partition.)
910 */ 895 */
911 if (nilfs->ns_current && nilfs->ns_current != sbi) {
912 printk(KERN_WARNING "NILFS (device %s): couldn't "
913 "remount because an RW-mount exists.\n",
914 sb->s_id);
915 err = -EBUSY;
916 goto restore_opts;
917 }
918 if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) {
919 printk(KERN_WARNING "NILFS (device %s): couldn't "
920 "remount because the current RO-mount is not "
921 "the latest one.\n",
922 sb->s_id);
923 err = -EINVAL;
924 goto restore_opts;
925 }
926 sb->s_flags &= ~MS_RDONLY; 896 sb->s_flags &= ~MS_RDONLY;
927 nilfs_clear_opt(sbi, SNAPSHOT);
928 sbi->s_snapshot_cno = 0;
929 897
930 err = nilfs_attach_segment_constructor(sbi); 898 err = nilfs_attach_segment_constructor(sbi);
931 if (err) 899 if (err)
@@ -934,8 +902,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
934 down_write(&nilfs->ns_sem); 902 down_write(&nilfs->ns_sem);
935 nilfs_setup_super(sbi); 903 nilfs_setup_super(sbi);
936 up_write(&nilfs->ns_sem); 904 up_write(&nilfs->ns_sem);
937
938 nilfs->ns_current = sbi;
939 } 905 }
940 out: 906 out:
941 up_write(&nilfs->ns_super_sem); 907 up_write(&nilfs->ns_super_sem);
@@ -1021,10 +987,14 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1021{ 987{
1022 struct nilfs_super_data sd; 988 struct nilfs_super_data sd;
1023 struct super_block *s; 989 struct super_block *s;
990 fmode_t mode = FMODE_READ;
1024 struct the_nilfs *nilfs; 991 struct the_nilfs *nilfs;
1025 int err, need_to_close = 1; 992 int err, need_to_close = 1;
1026 993
1027 sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type); 994 if (!(flags & MS_RDONLY))
995 mode |= FMODE_WRITE;
996
997 sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type);
1028 if (IS_ERR(sd.bdev)) 998 if (IS_ERR(sd.bdev))
1029 return PTR_ERR(sd.bdev); 999 return PTR_ERR(sd.bdev);
1030 1000
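
The hunk above also fixes a type confusion: open_bdev_exclusive() expects an fmode_t open mode, but the old code handed it the MS_* mount-flag word, which lives in an unrelated bit space. The mode is now derived explicitly, and the same value must accompany every later close_bdev_exclusive() call, as the following hunks do:

    /* Derive the block-device open mode from the mount flags. */
    fmode_t mode = FMODE_READ;

    if (!(flags & MS_RDONLY))
            mode |= FMODE_WRITE;
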
@@ -1091,10 +1061,12 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1091 1061
1092 /* New superblock instance created */ 1062 /* New superblock instance created */
1093 s->s_flags = flags; 1063 s->s_flags = flags;
1064 s->s_mode = mode;
1094 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); 1065 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
1095 sb_set_blocksize(s, block_size(sd.bdev)); 1066 sb_set_blocksize(s, block_size(sd.bdev));
1096 1067
1097 err = nilfs_fill_super(s, data, flags & MS_VERBOSE, nilfs); 1068 err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0,
1069 nilfs);
1098 if (err) 1070 if (err)
1099 goto cancel_new; 1071 goto cancel_new;
1100 1072
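
On the nilfs_fill_super() call above: MS_VERBOSE is the deprecated name for the same flag bit as MS_SILENT, and a fill_super routine's third argument is conventionally a 0/1 "silent" value, hence the explicit normalization rather than passing the raw bit through:

    /* Normalize the flag bit to the conventional 0/1 "silent" value. */
    int silent = (flags & MS_SILENT) ? 1 : 0;
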
@@ -1105,7 +1077,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1105 mutex_unlock(&nilfs->ns_mount_mutex); 1077 mutex_unlock(&nilfs->ns_mount_mutex);
1106 put_nilfs(nilfs); 1078 put_nilfs(nilfs);
1107 if (need_to_close) 1079 if (need_to_close)
1108 close_bdev_exclusive(sd.bdev, flags); 1080 close_bdev_exclusive(sd.bdev, mode);
1109 simple_set_mnt(mnt, s); 1081 simple_set_mnt(mnt, s);
1110 return 0; 1082 return 0;
1111 1083
@@ -1113,7 +1085,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1113 mutex_unlock(&nilfs->ns_mount_mutex); 1085 mutex_unlock(&nilfs->ns_mount_mutex);
1114 put_nilfs(nilfs); 1086 put_nilfs(nilfs);
1115 failed: 1087 failed:
1116 close_bdev_exclusive(sd.bdev, flags); 1088 close_bdev_exclusive(sd.bdev, mode);
1117 1089
1118 return err; 1090 return err;
1119 1091
@@ -1123,7 +1095,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1123 put_nilfs(nilfs); 1095 put_nilfs(nilfs);
1124 deactivate_locked_super(s); 1096 deactivate_locked_super(s);
1125 /* 1097 /*
1126 * deactivate_super() invokes close_bdev_exclusive(). 1098 * deactivate_locked_super() invokes close_bdev_exclusive().
1127 * We must finish all post-cleaning before this call; 1099 * We must finish all post-cleaning before this call;
1128 * put_nilfs() needs the block device. 1100 * put_nilfs() needs the block device.
1129 */ 1101 */
@@ -1138,54 +1110,93 @@ struct file_system_type nilfs_fs_type = {
1138 .fs_flags = FS_REQUIRES_DEV, 1110 .fs_flags = FS_REQUIRES_DEV,
1139}; 1111};
1140 1112
1141static int __init init_nilfs_fs(void) 1113static void nilfs_inode_init_once(void *obj)
1142{ 1114{
1143 int err; 1115 struct nilfs_inode_info *ii = obj;
1144
1145 err = nilfs_init_inode_cache();
1146 if (err)
1147 goto failed;
1148 1116
1149 err = nilfs_init_transaction_cache(); 1117 INIT_LIST_HEAD(&ii->i_dirty);
1150 if (err) 1118#ifdef CONFIG_NILFS_XATTR
1151 goto failed_inode_cache; 1119 init_rwsem(&ii->xattr_sem);
1120#endif
1121 nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
1122 ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union;
1123 inode_init_once(&ii->vfs_inode);
1124}
1152 1125
1153 err = nilfs_init_segbuf_cache(); 1126static void nilfs_segbuf_init_once(void *obj)
1154 if (err) 1127{
1155 goto failed_transaction_cache; 1128 memset(obj, 0, sizeof(struct nilfs_segment_buffer));
1129}
1156 1130
1157 err = nilfs_btree_path_cache_init(); 1131static void nilfs_destroy_cachep(void)
1158 if (err) 1132{
1159 goto failed_segbuf_cache; 1133 if (nilfs_inode_cachep)
1134 kmem_cache_destroy(nilfs_inode_cachep);
1135 if (nilfs_transaction_cachep)
1136 kmem_cache_destroy(nilfs_transaction_cachep);
1137 if (nilfs_segbuf_cachep)
1138 kmem_cache_destroy(nilfs_segbuf_cachep);
1139 if (nilfs_btree_path_cache)
1140 kmem_cache_destroy(nilfs_btree_path_cache);
1141}
1160 1142
1161 err = register_filesystem(&nilfs_fs_type); 1143static int __init nilfs_init_cachep(void)
1162 if (err) 1144{
1163 goto failed_btree_path_cache; 1145 nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
1146 sizeof(struct nilfs_inode_info), 0,
1147 SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once);
1148 if (!nilfs_inode_cachep)
1149 goto fail;
1150
1151 nilfs_transaction_cachep = kmem_cache_create("nilfs2_transaction_cache",
1152 sizeof(struct nilfs_transaction_info), 0,
1153 SLAB_RECLAIM_ACCOUNT, NULL);
1154 if (!nilfs_transaction_cachep)
1155 goto fail;
1156
1157 nilfs_segbuf_cachep = kmem_cache_create("nilfs2_segbuf_cache",
1158 sizeof(struct nilfs_segment_buffer), 0,
1159 SLAB_RECLAIM_ACCOUNT, nilfs_segbuf_init_once);
1160 if (!nilfs_segbuf_cachep)
1161 goto fail;
1162
1163 nilfs_btree_path_cache = kmem_cache_create("nilfs2_btree_path_cache",
1164 sizeof(struct nilfs_btree_path) * NILFS_BTREE_LEVEL_MAX,
1165 0, 0, NULL);
1166 if (!nilfs_btree_path_cache)
1167 goto fail;
1164 1168
1165 return 0; 1169 return 0;
1166 1170
1167 failed_btree_path_cache: 1171fail:
1168 nilfs_btree_path_cache_destroy(); 1172 nilfs_destroy_cachep();
1173 return -ENOMEM;
1174}
1175
1176static int __init init_nilfs_fs(void)
1177{
1178 int err;
1169 1179
1170 failed_segbuf_cache: 1180 err = nilfs_init_cachep();
1171 nilfs_destroy_segbuf_cache(); 1181 if (err)
1182 goto fail;
1172 1183
1173 failed_transaction_cache: 1184 err = register_filesystem(&nilfs_fs_type);
1174 nilfs_destroy_transaction_cache(); 1185 if (err)
1186 goto free_cachep;
1175 1187
1176 failed_inode_cache: 1188 printk(KERN_INFO "NILFS version 2 loaded\n");
1177 nilfs_destroy_inode_cache(); 1189 return 0;
1178 1190
1179 failed: 1191free_cachep:
1192 nilfs_destroy_cachep();
1193fail:
1180 return err; 1194 return err;
1181} 1195}
1182 1196
1183static void __exit exit_nilfs_fs(void) 1197static void __exit exit_nilfs_fs(void)
1184{ 1198{
1185 nilfs_destroy_segbuf_cache(); 1199 nilfs_destroy_cachep();
1186 nilfs_destroy_transaction_cache();
1187 nilfs_destroy_inode_cache();
1188 nilfs_btree_path_cache_destroy();
1189 unregister_filesystem(&nilfs_fs_type); 1200 unregister_filesystem(&nilfs_fs_type);
1190} 1201}
1191 1202
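
The rework above collapses four hand-rolled cache init/destroy pairs into nilfs_init_cachep()/nilfs_destroy_cachep(). The central API is kmem_cache_create(), whose constructor runs once when a slab object is first created rather than on every allocation, which is why only fields that stay valid across object reuse (list heads, locks, and the like) belong in it. A minimal self-contained sketch of the pattern; every name below is illustrative, not from the patch:

    #include <linux/list.h>
    #include <linux/slab.h>

    struct example_obj {
            struct list_head list;
    };

    static struct kmem_cache *example_cachep;

    /* Constructor: runs once per slab object, not once per allocation. */
    static void example_init_once(void *obj)
    {
            struct example_obj *e = obj;

            INIT_LIST_HEAD(&e->list);
    }

    static int __init example_init_cachep(void)
    {
            example_cachep = kmem_cache_create("example_cache",
                            sizeof(struct example_obj), 0,
                            SLAB_RECLAIM_ACCOUNT, example_init_once);
            return example_cachep ? 0 : -ENOMEM;
    }
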
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 92733d5651d2..a756168a21c2 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -386,7 +386,7 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
386 386
387 nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); 387 nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
388 if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) { 388 if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) {
389 printk(KERN_ERR "NILFS: too short segment. \n"); 389 printk(KERN_ERR "NILFS: too short segment.\n");
390 return -EINVAL; 390 return -EINVAL;
391 } 391 }
392 392
@@ -486,11 +486,15 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
486 printk(KERN_WARNING 486 printk(KERN_WARNING
487 "NILFS warning: unable to read secondary superblock\n"); 487 "NILFS warning: unable to read secondary superblock\n");
488 488
489 /*
490 * Compare two super blocks and set 1 in swp if the secondary
491 * super block is valid and newer. Otherwise, set 0 in swp.
492 */
489 valid[0] = nilfs_valid_sb(sbp[0]); 493 valid[0] = nilfs_valid_sb(sbp[0]);
490 valid[1] = nilfs_valid_sb(sbp[1]); 494 valid[1] = nilfs_valid_sb(sbp[1]);
491 swp = valid[1] && 495 swp = valid[1] && (!valid[0] ||
492 (!valid[0] || 496 le64_to_cpu(sbp[1]->s_last_cno) >
493 le64_to_cpu(sbp[1]->s_wtime) > le64_to_cpu(sbp[0]->s_wtime)); 497 le64_to_cpu(sbp[0]->s_last_cno));
494 498
495 if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) { 499 if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) {
496 brelse(sbh[1]); 500 brelse(sbh[1]);
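
Besides adding the comment, the hunk above changes the tie-breaker between the two on-disk super blocks from write time (s_wtime) to the last checkpoint number (s_last_cno); checkpoint numbers are monotonic, so the comparison remains correct even if the system clock moved backwards between super block writes. In isolation the rule is (helper name is illustrative):

    /* Prefer the secondary super block only when it is valid and records
     * a newer checkpoint than the primary (or the primary is invalid). */
    static int prefer_sb2(struct nilfs_super_block *sbp[2], int valid[2])
    {
            return valid[1] && (!valid[0] ||
                                le64_to_cpu(sbp[1]->s_last_cno) >
                                le64_to_cpu(sbp[0]->s_last_cno));
    }
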
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index e9795f1724d7..1ab974533697 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -29,6 +29,7 @@
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/blkdev.h> 30#include <linux/blkdev.h>
31#include <linux/backing-dev.h> 31#include <linux/backing-dev.h>
32#include <linux/slab.h>
32#include "sb.h" 33#include "sb.h"
33 34
34/* the_nilfs struct */ 35/* the_nilfs struct */
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 037e878e03fc..fcc2f064af83 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/dcache.h> 19#include <linux/dcache.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/gfp.h>
21#include <linux/init.h> 22#include <linux/init.h>
22#include <linux/module.h> 23#include <linux/module.h>
23#include <linux/srcu.h> 24#include <linux/srcu.h>
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 3165d85aada2..0399bcbe09c8 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -87,7 +87,6 @@
87#include <linux/kernel.h> 87#include <linux/kernel.h>
88#include <linux/module.h> 88#include <linux/module.h>
89#include <linux/mutex.h> 89#include <linux/mutex.h>
90#include <linux/slab.h>
91#include <linux/spinlock.h> 90#include <linux/spinlock.h>
92#include <linux/writeback.h> /* for inode_lock */ 91#include <linux/writeback.h> /* for inode_lock */
93 92
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index 3e56dbffe729..b3a159b21cfd 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -15,6 +15,7 @@ config INOTIFY
15 15
16config INOTIFY_USER 16config INOTIFY_USER
17 bool "Inotify support for userspace" 17 bool "Inotify support for userspace"
18 select ANON_INODES
18 select FSNOTIFY 19 select FSNOTIFY
19 default y 20 default y
20 ---help--- 21 ---help---
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 1afb0a10229f..e27960cd76ab 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -28,6 +28,7 @@
28#include <linux/path.h> /* struct path */ 28#include <linux/path.h> /* struct path */
29#include <linux/slab.h> /* kmem_* */ 29#include <linux/slab.h> /* kmem_* */
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/sched.h>
31 32
32#include "inotify.h" 33#include "inotify.h"
33 34
@@ -146,6 +147,7 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
146 idr_for_each(&group->inotify_data.idr, idr_callback, group); 147 idr_for_each(&group->inotify_data.idr, idr_callback, group);
147 idr_remove_all(&group->inotify_data.idr); 148 idr_remove_all(&group->inotify_data.idr);
148 idr_destroy(&group->inotify_data.idr); 149 idr_destroy(&group->inotify_data.idr);
150 free_uid(group->inotify_data.user);
149} 151}
150 152
151void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv) 153void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv)
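
The free_uid() added above releases the user_struct reference taken when the inotify group was created; without it, tearing down a group leaked the per-user accounting structure. The shape of the pairing, with the create-time half reconstructed as an assumption rather than quoted from this diff:

    /* At group creation: take a counted reference on the user ... */
    group->inotify_data.user = get_uid(user);
    /* ... and release it in the group's private-data destructor. */
    free_uid(group->inotify_data.user);
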
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 472cdf29ef82..e46ca685b9be 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -546,21 +546,24 @@ retry:
546 if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL))) 546 if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
547 goto out_err; 547 goto out_err;
548 548
549 /* we are putting the mark on the idr, take a reference */
550 fsnotify_get_mark(&tmp_ientry->fsn_entry);
551
549 spin_lock(&group->inotify_data.idr_lock); 552 spin_lock(&group->inotify_data.idr_lock);
550 ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry, 553 ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
551 group->inotify_data.last_wd+1, 554 group->inotify_data.last_wd+1,
552 &tmp_ientry->wd); 555 &tmp_ientry->wd);
553 spin_unlock(&group->inotify_data.idr_lock); 556 spin_unlock(&group->inotify_data.idr_lock);
554 if (ret) { 557 if (ret) {
558 /* we didn't get on the idr, drop the idr reference */
559 fsnotify_put_mark(&tmp_ientry->fsn_entry);
560
555 /* idr was out of memory allocate and try again */ 561 /* idr was out of memory allocate and try again */
556 if (ret == -EAGAIN) 562 if (ret == -EAGAIN)
557 goto retry; 563 goto retry;
558 goto out_err; 564 goto out_err;
559 } 565 }
560 566
561 /* we put the mark on the idr, take a reference */
562 fsnotify_get_mark(&tmp_ientry->fsn_entry);
563
564 /* we are on the idr, now get on the inode */ 567 /* we are on the idr, now get on the inode */
565 ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode); 568 ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
566 if (ret) { 569 if (ret) {
@@ -578,16 +581,13 @@ retry:
578 /* return the watch descriptor for this new entry */ 581 /* return the watch descriptor for this new entry */
579 ret = tmp_ientry->wd; 582 ret = tmp_ientry->wd;
580 583
581 /* match the ref from fsnotify_init_markentry() */
582 fsnotify_put_mark(&tmp_ientry->fsn_entry);
583
584 /* if this mark added a new event update the group mask */ 584 /* if this mark added a new event update the group mask */
585 if (mask & ~group->mask) 585 if (mask & ~group->mask)
586 fsnotify_recalc_group_mask(group); 586 fsnotify_recalc_group_mask(group);
587 587
588out_err: 588out_err:
589 if (ret < 0) 589 /* match the ref from fsnotify_init_markentry() */
590 kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry); 590 fsnotify_put_mark(&tmp_ientry->fsn_entry);
591 591
592 return ret; 592 return ret;
593} 593}
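
Both inotify_user.c hunks above fix the same reference-counting race: the reference the idr holds on a mark must be taken before idr_get_new_above() publishes it, since a concurrent lookup can find and use the entry the moment it lands in the idr; conversely, the initial reference from fsnotify_init_markentry() is now dropped unconditionally at out_err whether or not the watch was added. The publish pattern in general form, with hypothetical names:

    /* Take the table's reference *before* the object becomes findable. */
    get_ref(obj);
    err = publish(table, obj);
    if (err) {
            put_ref(obj);   /* never published: drop the table's ref */
            return err;
    }
    /* From here on, concurrent lookups may already hold a reference. */
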
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
deleted file mode 100644
index 37c11e194372..000000000000
--- a/fs/ntfs/ChangeLog
+++ /dev/null
@@ -1,1702 +0,0 @@
1ToDo/Notes:
2 - Find and fix bugs.
3 - The only places in the kernel where a file is resized are
4 ntfs_file_write*() and ntfs_truncate() for both of which i_mutex is
5 held. Just have to be careful in read-/writepage and other helpers
6 not running under i_mutex that we play nice. Also need to be careful
7 with initialized_size extension in ntfs_file_write*() and writepage.
8 UPDATE: The only things that need to be checked are the compressed
9 write and the other attribute resize/write cases like index
10 attributes, etc. For now none of these are implemented so they are safe.
11 - Implement filling in of holes in aops.c::ntfs_writepage() and its
12 helpers.
13 - Implement mft.c::sync_mft_mirror_umount(). We currently will just
14 leave the volume dirty on umount if the final iput(vol->mft_ino)
15 causes a write of any mirrored mft records due to the mft mirror
16 inode having been discarded already. Whether this can actually ever
17 happen is unclear however so it is worth waiting until someone hits
18 the problem.
19
202.1.29 - Fix a deadlock at mount time.
21
22 - During mount the VFS holds s_umount lock on the superblock. So when
23 we try to empty the journal $LogFile contents by calling
24 ntfs_attr_set() when the machine does not have much memory and the
25 journal is large ntfs_attr_set() results in the VM trying to balance
26 dirty pages which in turn tries to take the s_umount lock and thus we
27 get a deadlock. The solution is to not use ntfs_attr_set() and
28 instead do the zeroing by hand at the block level rather than page
29 cache level.
30 - Fix sparse warnings.
31
322.1.28 - Fix a deadlock.
33
34 - Fix deadlock in fs/ntfs/inode.c::ntfs_put_inode(). Thanks to Sergey
35 Vlasov for the report and detailed analysis of the deadlock. The fix
36 involved getting rid of ntfs_put_inode() altogether and hence NTFS no
37 longer has a ->put_inode super operation.
38
392.1.27 - Various bug fixes and cleanups.
40
41 - Fix two compiler warnings on Alpha. Thanks to Andrew Morton for
42 reporting them.
43 - Fix an (innocent) off-by-one error in the runlist code.
44 - Fix a buggette in an "should be impossible" case handling where we
45 continued the attribute lookup loop instead of aborting it.
46 - Use buffer_migrate_page() for the ->migratepage function of all ntfs
47 address space operations.
48 - Fix comparison of $MFT and $MFTMirr to not bail out when there are
49 unused, invalid mft records which are the same in both $MFT and
50 $MFTMirr.
51 - Add support for sparse files which have a compression unit of 0.
52 - Remove all the make_bad_inode() calls. This should only be called
53 from read inode and new inode code paths.
54 - Limit name length in fs/ntfs/unistr.c::ntfs_nlstoucs() to maximum
55 allowed by NTFS, i.e. 255 Unicode characters, not including the
56 terminating NULL (which is not stored on disk).
57 - Improve comments on file attribute flags in fs/ntfs/layout.h.
58 - Fix a bug in fs/ntfs/inode.c::ntfs_read_locked_index_inode() where we
59 forgot to update a temporary variable so loading index inodes which
60 have an index allocation attribute failed.
61 - Add a missing call to flush_dcache_mft_record_page() in
62 fs/ntfs/inode.c::ntfs_write_inode().
63 - Handle the recently introduced -ENAMETOOLONG return value from
64 fs/ntfs/unistr.c::ntfs_nlstoucs() in fs/ntfs/namei.c::ntfs_lookup().
65 - Semaphore to mutex conversion. (Ingo Molnar)
66
672.1.26 - Minor bug fixes and updates.
68
69 - Fix a potential overflow in file.c where a cast to s64 was missing in
70 a left shift of a page index.
71 - The struct inode has had its i_sem semaphore changed to a mutex named
72 i_mutex.
73 - We have struct kmem_cache now so use it instead of the typedef
74 kmem_cache_t. (Pekka Enberg)
75 - Implement support for sector sizes above 512 bytes (up to the maximum
76 supported by NTFS which is 4096 bytes).
77 - Do more detailed reporting of why we cannot mount read-write by
78 special casing the VOLUME_MODIFIED_BY_CHKDSK flag.
79 - Miscellaneous updates to layout.h.
80 - Cope with attribute list attribute having invalid flags. Windows
81 copes with this and even chkdsk does not detect or fix this so we
82 have to cope with it, too. Thanks to Pawel Kot for reporting the
83 problem.
84
852.1.25 - (Almost) fully implement write(2) and truncate(2).
86
87 - Change ntfs_map_runlist_nolock(), ntfs_attr_find_vcn_nolock() and
88 {__,}ntfs_cluster_free() to also take an optional attribute search
89 context as argument. This allows calling these functions with the
90 mft record mapped. Update all callers.
91 - Fix potential deadlock in ntfs_mft_data_extend_allocation_nolock()
92 error handling by passing in the active search context when calling
93 ntfs_cluster_free().
94 - Change ntfs_cluster_alloc() to take an extra boolean parameter
95 specifying whether the cluster are being allocated to extend an
96 attribute or to fill a hole.
97 - Change ntfs_attr_make_non_resident() to call ntfs_cluster_alloc()
98 with @is_extension set to TRUE and remove the runlist terminator
99 fixup code as this is now done by ntfs_cluster_alloc().
100 - Change ntfs_attr_make_non_resident to take the attribute value size
101 as an extra parameter. This is needed since we need to know the size
102 before we can map the mft record and our callers always know it. The
103 reason we cannot simply read the size from the vfs inode i_size is
104 that this is not necessarily uptodate. This happens when
105 ntfs_attr_make_non_resident() is called in the ->truncate call path.
106 - Fix ntfs_attr_make_non_resident() to update the vfs inode i_blocks
107 which is zero for a resident attribute but should no longer be zero
108 once the attribute is non-resident as it then has real clusters
109 allocated.
110 - Add fs/ntfs/attrib.[hc]::ntfs_attr_extend_allocation(), a function to
111 extend the allocation of an attribute. Optionally, the data size,
112 but not the initialized size can be extended, too.
113 - Implement fs/ntfs/inode.[hc]::ntfs_truncate(). It only supports
114 uncompressed and unencrypted files and it never creates sparse files
115 at least for the moment (making a file sparse requires us to modify
116 its directory entries and we do not support directory operations at
117 the moment). Also, support for highly fragmented files, i.e. ones
118 whose data attribute is split across multiple extents, is severely
119 limited. When such a case is encountered, EOPNOTSUPP is returned.
120 - Enable ATTR_SIZE attribute changes in ntfs_setattr(). This completes
121 the initial implementation of file truncation. Now both open(2)ing
122 a file with the O_TRUNC flag and the {,f}truncate(2) system calls
123 will resize a file appropriately. The limitations are that only
124 uncompressed and unencrypted files are supported. Also, there is
125 only very limited support for highly fragmented files (the ones whose
126 $DATA attribute is split into multiple attribute extents).
127 - In attrib.c::ntfs_attr_set() call balance_dirty_pages_ratelimited()
128 and cond_resched() in the main loop as we could be dirtying a lot of
129 pages and this ensures we play nice with the VM and the system as a
130 whole.
131 - Implement file operations ->write, ->aio_write, ->writev for regular
132 files. This replaces the old use of generic_file_write(), et al and
133 the address space operations ->prepare_write and ->commit_write.
134 This means that both sparse and non-sparse (unencrypted and
135 uncompressed) files can now be extended using the normal write(2)
136 code path. There are two limitations at present and these are that
137 we never create sparse files and that we only have limited support
138 for highly fragmented files, i.e. ones whose data attribute is split
139 across multiple extents. When such a case is encountered,
140 EOPNOTSUPP is returned.
141 - $EA attributes can be both resident and non-resident.
142 - Use %z for size_t to fix compilation warnings. (Andrew Morton)
143 - Fix compilation warnings with gcc-4.0.2 on SUSE 10.0.
144 - Document extended attribute ($EA) NEED_EA flag. (Based on libntfs
145 patch by Yura Pakhuchiy.)
146
1472.1.24 - Lots of bug fixes and support more clean journal states.
148
149 - Support journals ($LogFile) which have been modified by chkdsk. This
150 means users can boot into Windows after we marked the volume dirty.
151 The Windows boot will run chkdsk and then reboot. The user can then
152 immediately boot into Linux rather than having to do a full Windows
153 boot first before rebooting into Linux and we will recognize such a
154 journal and empty it as it is clean by definition. Note, this only
155 works if chkdsk left the journal in an obviously clean state.
156 - Support journals ($LogFile) with only one restart page as well as
157 journals with two different restart pages. We sanity check both and
158 either use the only sane one or the more recent one of the two in the
159 case that both are valid.
160 - Add fs/ntfs/malloc.h::ntfs_malloc_nofs_nofail() which is analogous to
161 ntfs_malloc_nofs() but it performs allocations with __GFP_NOFAIL and
162 hence cannot fail.
163 - Use ntfs_malloc_nofs_nofail() in the two critical regions in
164 fs/ntfs/runlist.c::ntfs_runlists_merge(). This means we no longer
165 need to panic() if the allocation fails as it now cannot fail.
166 - Fix two nasty runlist merging bugs that had gone unnoticed so far.
167 Thanks to Stefano Picerno for the bug report.
168 - Remove two bogus BUG_ON()s from fs/ntfs/mft.c.
169 - Fix handling of valid but empty mapping pairs array in
170 fs/ntfs/runlist.c::ntfs_mapping_pairs_decompress().
171 - Report unrepresentable inodes during ntfs_readdir() as KERN_WARNING
172 messages and include the inode number. Thanks to Yura Pakhuchiy for
173 pointing this out.
174 - Change ntfs_rl_truncate_nolock() to throw away the runlist if the new
175 length is zero.
176 - Add runlist.[hc]::ntfs_rl_punch_nolock() which punches a caller
177 specified hole into a runlist.
178 - Fix a bug in fs/ntfs/index.c::ntfs_index_lookup(). When the returned
179 index entry is in the index root, we forgot to set the @ir pointer in
180 the index context. Thanks to Yura Pakhuchiy for finding this bug.
181 - Remove bogus setting of PageError in ntfs_read_compressed_block().
182 - Add fs/ntfs/attrib.[hc]::ntfs_resident_attr_value_resize().
183 - Fix a bug in ntfs_map_runlist_nolock() where we forgot to protect
184 access to the allocated size in the ntfs inode with the size lock.
185 - Fix ntfs_attr_vcn_to_lcn_nolock() and ntfs_attr_find_vcn_nolock() to
186 return LCN_ENOENT when there is no runlist and the allocated size is
187 zero.
188 - Fix load_attribute_list() to handle the case of a NULL runlist.
189 - Fix handling of sparse attributes in ntfs_attr_make_non_resident().
190 - Add BUG() checks to ntfs_attr_make_non_resident() and ntfs_attr_set()
191 to ensure that these functions are never called for compressed or
192 encrypted attributes.
193 - Fix cluster (de)allocators to work when the runlist is NULL and more
194 importantly to take a locked runlist rather than them locking it
195 which leads to lock reversal.
196 - Truncate {a,c,m}time to the ntfs supported time granularity when
197 updating the times in the inode in ntfs_setattr().
198 - Fixup handling of sparse, compressed, and encrypted attributes in
199 fs/ntfs/inode.c::ntfs_read_locked_{,attr_,index_}inode(),
200 fs/ntfs/aops.c::ntfs_{read,write}page().
201 - Make ntfs_write_block() not instantiate sparse blocks if they contain
202 only zeroes.
203 - Optimize fs/ntfs/aops.c::ntfs_write_block() by extending the page
204 lock protection over the buffer submission for i/o which allows the
205 removal of the get_bh()/put_bh() pairs for each buffer.
206 - Fix fs/ntfs/aops.c::ntfs_{read,write}_block() to handle the case
207 where a concurrent truncate has truncated the runlist under our feet.
208 - Fix page_has_buffers()/page_buffers() handling in fs/ntfs/aops.c.
209 - In fs/ntfs/aops.c::ntfs_end_buffer_async_read(), use a bit spin lock
210 in the first buffer head instead of a driver global spin lock to
211 improve scalability.
212 - Minor fix to error handling and error message display in
213 fs/ntfs/aops.c::ntfs_prepare_nonresident_write().
214 - Change the mount options {u,f,d}mask to always parse the number as
215 an octal number to conform to how chmod(1) works, too. Thanks to
216 Giuseppe Bilotta and Horst von Brand for pointing out the errors of
217 my ways.
218 - Fix various bugs in the runlist merging code. (Based on libntfs
219 changes by Richard Russon.)
220 - Fix sparse warnings that have crept in over time.
221 - Change ntfs_cluster_free() to require a write locked runlist on entry
222 since we otherwise get into a lock reversal deadlock if a read locked
223 runlist is passed in. In the process also change it to take an ntfs
224 inode instead of a vfs inode as parameter.
225 - Fix the definition of the CHKD ntfs record magic. It had an off by
226 two error causing it to be CHKB instead of CHKD.
227 - Fix a stupid bug in __ntfs_bitmap_set_bits_in_run() which caused the
228 count to become negative and hence we had a wild memset() scribbling
229 all over the system's RAM.
230
2312.1.23 - Implement extension of resident files and make writing safe as well as
232 many bug fixes, cleanups, and enhancements...
233
234 - Add printk rate limiting for ntfs_warning() and ntfs_error() when
235 compiled without debug. This avoids a possible denial of service
236 attack. Thanks to Carl-Daniel Hailfinger from SuSE for pointing this
237 out.
238 - Fix compilation warnings on ia64. (Randy Dunlap)
239 - Use i_size_{read,write}() instead of reading i_size by hand and cache
240 the value where appropriate.
241 - Add size_lock to the ntfs_inode structure. This is an rw spinlock
242 and it locks against access to the inode sizes. Note, ->size_lock
243 is also accessed from irq context so you must use the _irqsave and
244 _irqrestore lock and unlock functions, respectively. Protect all
245 accesses to allocated_size, initialized_size, and compressed_size.
246 - Minor optimization to fs/ntfs/super.c::ntfs_statfs() and its helpers.
247 - Implement extension of resident files in the regular file write code
248 paths (fs/ntfs/aops.c::ntfs_{prepare,commit}_write()). At present
249 this only works until the data attribute becomes too big for the mft
250 record after which we abort the write returning -EOPNOTSUPP from
251 ntfs_prepare_write().
252 - Add disable_sparse mount option together with a per volume sparse
253 enable bit which is set appropriately and a per inode sparse disable
254 bit which is preset on some system file inodes as appropriate.
255 - Enforce that sparse support is disabled on NTFS volumes pre 3.0.
256 - Fix a bug in fs/ntfs/runlist.c::ntfs_mapping_pairs_decompress() in
257 the creation of the unmapped runlist element for the base attribute
258 extent.
259 - Split ntfs_map_runlist() into ntfs_map_runlist() and a non-locking
260 helper ntfs_map_runlist_nolock() which is used by ntfs_map_runlist().
261 This allows us to map runlist fragments with the runlist lock already
262 held without having to drop and reacquire it around the call. Adapt
263 all callers.
264 - Change ntfs_find_vcn() to ntfs_find_vcn_nolock() which takes a locked
265 runlist. This allows us to find runlist elements with the runlist
266 lock already held without having to drop and reacquire it around the
267 call. Adapt all callers.
268 - Change time to u64 in time.h::ntfs2utc() as it otherwise generates a
269 warning in the do_div() call on sparc32. Thanks to Meelis Roos for
270 the report and analysis of the warning.
271 - Fix a nasty runlist merge bug when merging two holes.
272 - Set the ntfs_inode->allocated_size to the real allocated size in the
273 mft record for resident attributes (fs/ntfs/inode.c).
274 - Small readability cleanup to use "a" instead of "ctx->attr"
275 everywhere (fs/ntfs/inode.c).
276 - Make fs/ntfs/namei.c::ntfs_get_{parent,dentry} static and move the
277 definition of ntfs_export_ops from fs/ntfs/super.c to namei.c. Also,
278 declare ntfs_export_ops in fs/ntfs/ntfs.h.
279 - Correct sparse file handling. The compressed values need to be
280 checked and set in the ntfs inode as done for compressed files and
281 the compressed size needs to be used for vfs inode->i_blocks instead
282 of the allocated size, again, as done for compressed files.
283 - Add AT_EA in addition to AT_DATA to whitelist for being allowed to be
284 non-resident in fs/ntfs/attrib.c::ntfs_attr_can_be_non_resident().
285 - Add fs/ntfs/attrib.c::ntfs_attr_vcn_to_lcn_nolock() used by the new
286 write code.
287 - Fix bug in fs/ntfs/attrib.c::ntfs_find_vcn_nolock() where after
288 dropping the read lock and taking the write lock we were not checking
289 whether someone else did not already do the work we wanted to do.
290 - Rename fs/ntfs/attrib.c::ntfs_find_vcn_nolock() to
291 ntfs_attr_find_vcn_nolock() and update all callers.
292 - Add fs/ntfs/attrib.[hc]::ntfs_attr_make_non_resident().
293 - Fix sign of various error return values to be negative in
294 fs/ntfs/lcnalloc.c.
295 - Modify ->readpage and ->writepage (fs/ntfs/aops.c) so they detect and
296 handle the case where an attribute is converted from resident to
297 non-resident by a concurrent file write.
298 - Remove checks for NULL before calling kfree() since kfree() does the
299 checking itself. (Jesper Juhl)
300 - Some utilities modify the boot sector but do not update the checksum.
301 Thus, relax the checking in fs/ntfs/super.c::is_boot_sector_ntfs() to
302 only emit a warning when the checksum is incorrect rather than
303 refusing the mount. Thanks to Bernd Casimir for pointing this
304 problem out.
305 - Update attribute definition handling.
306 - Add NTFS_MAX_CLUSTER_SIZE and NTFS_MAX_PAGES_PER_CLUSTER constants.
307 - Use NTFS_MAX_CLUSTER_SIZE in super.c instead of hard coding 0x10000.
308 - Use MAX_BUF_PER_PAGE instead of variable sized array allocation for
309 better code generation and one less sparse warning in fs/ntfs/aops.c.
310 - Remove spurious void pointer casts from fs/ntfs/. (Pekka Enberg)
311 - Use C99 style structure initialization after memory allocation where
312 possible (fs/ntfs/{attrib.c,index.c,super.c}). Thanks to Al Viro and
313 Pekka Enberg.
314 - Stamp the transaction log ($UsnJrnl), aka user space journal, if it
315 is active on the volume and we are mounting read-write or remounting
316 from read-only to read-write.
317 - Fix a bug in address space operations error recovery code paths where
318 if the runlist was not mapped at all and a mapping error occurred we
319 would leave the runlist locked on exit to the function so that the
320 next access to the same file would try to take the lock and deadlock.
321 - Detect the case when Windows has been suspended to disk on the volume
322 to be mounted and if this is the case do not allow (re)mounting
323 read-write. This is done by parsing hiberfil.sys if present.
324 - Fix several occurrences of a bug where we would perform 'var & ~const'
325 with a 64-bit variable and an int, i.e. 32-bit, constant. This causes
326 the higher order 32-bits of the 64-bit variable to be zeroed. To fix
327 this cast the 'const' to the same 64-bit type as 'var'.
328 - Change the runlist terminator of the newly allocated cluster(s) to
329 LCN_ENOENT in ntfs_attr_make_non_resident(). Otherwise the runlist
330 code gets confused.
331 - Add an extra parameter @last_vcn to ntfs_get_size_for_mapping_pairs()
332 and ntfs_mapping_pairs_build() to allow the runlist encoding to be
333 partial which is desirable when filling holes in sparse attributes.
334 Update all callers.
335 - Change ntfs_map_runlist_nolock() to only decompress the mapping pairs
336 if the requested vcn is inside it. Otherwise we get into problems
337 when we try to map an out of bounds vcn because we then try to map
338 the already mapped runlist fragment which causes
339 ntfs_mapping_pairs_decompress() to fail and return error. Update
340 ntfs_attr_find_vcn_nolock() accordingly.
341 - Fix a nasty deadlock that appeared in recent kernels.
342 The situation: VFS inode X on a mounted ntfs volume is dirty. For
343 same inode X, the ntfs_inode is dirty and thus corresponding on-disk
344 inode, i.e. mft record, which is in a dirty PAGE_CACHE_PAGE belonging
345 to the table of inodes, i.e. $MFT, inode 0.
346 What happens:
347 Process 1: sys_sync()/umount()/whatever... calls
348 __sync_single_inode() for $MFT -> do_writepages() -> write_page for
349 the dirty page containing the on-disk inode X, the page is now locked
350 -> ntfs_write_mst_block() which clears PageUptodate() on the page to
351 prevent anyone else getting hold of it whilst it does the write out.
352 This is necessary as the on-disk inode needs "fixups" applied before
353 the write to disk which are removed again after the write and
354 PageUptodate is then set again. It then analyses the page looking
355 for dirty on-disk inodes and when it finds one it calls
356 ntfs_may_write_mft_record() to see if it is safe to write this
357 on-disk inode. This then calls ilookup5() to check if the
358 corresponding VFS inode is in icache(). This in turn calls ifind()
359 which waits on the inode lock via wait_on_inode whilst holding the
360 global inode_lock.
361 Process 2: pdflush results in a call to __sync_single_inode for the
362 same VFS inode X on the ntfs volume. This locks the inode (I_LOCK)
363 then calls write-inode -> ntfs_write_inode -> map_mft_record() ->
364 read_cache_page() for the page (in page cache of table of inodes
365 $MFT, inode 0) containing the on-disk inode. This page has
366 PageUptodate() clear because of Process 1 (see above) so
367 read_cache_page() blocks when it tries to take the page lock for the
368 page so it can call ntfs_read_page().
369 Thus Process 1 is holding the page lock on the page containing the
370 on-disk inode X and it is waiting on the inode X to be unlocked in
371 ifind() so it can write the page out and then unlock the page.
372 And Process 2 is holding the inode lock on inode X and is waiting for
373 the page to be unlocked so it can call ntfs_readpage() or discover
374 that Process 1 set PageUptodate() again and use the page.
375 Thus we have a deadlock due to ifind() waiting on the inode lock.
376 The solution: The fix is to use the newly introduced
377 ilookup5_nowait() which does not wait on the inode's lock and hence
378 avoids the deadlock. This is safe as we do not care about the VFS
379 inode and only use the fact that it is in the VFS inode cache and the
380 fact that the vfs and ntfs inodes are one struct in memory to find
381 the ntfs inode in memory if present. Also, the ntfs inode has its
382 own locking so it does not matter if the vfs inode is locked.
383 - Fix bug in mft record writing where we forgot to set the device in
384 the buffers when mapping them after the VM had discarded them.
385 Thanks to Martin MOKREJŠ for the bug report.
386
3872.1.22 - Many bug and race fixes and error handling improvements.
388
389 - Improve error handling in fs/ntfs/inode.c::ntfs_truncate().
390 - Change fs/ntfs/inode.c::ntfs_truncate() to return an error code
391 instead of void and provide a helper ntfs_truncate_vfs() for the
392 vfs ->truncate method.
393 - Add a new ntfs inode flag NInoTruncateFailed() and modify
394 fs/ntfs/inode.c::ntfs_truncate() to set and clear it appropriately.
395 - Fix min_size and max_size definitions in ATTR_DEF structure in
396 fs/ntfs/layout.h to be signed.
397 - Add attribute definition handling helpers to fs/ntfs/attrib.[hc]:
398 ntfs_attr_size_bounds_check(), ntfs_attr_can_be_non_resident(), and
399 ntfs_attr_can_be_resident(), which in turn use the new private helper
400 ntfs_attr_find_in_attrdef().
401 - In fs/ntfs/aops.c::mark_ntfs_record_dirty(), take the
402 mapping->private_lock around the dirtying of the buffer heads
403 analogous to the way it is done in __set_page_dirty_buffers().
404 - Ensure the mft record size does not exceed the PAGE_CACHE_SIZE at
405 mount time as this cannot work with the current implementation.
406 - Check for location of attribute name and improve error handling in
407 general in fs/ntfs/inode.c::ntfs_read_locked_inode() and friends.
408 - In fs/ntfs/aops.c::ntfs_writepage(), if the page is fully outside
409 i_size, i.e. race with truncate, invalidate the buffers on the page
410 so that they become freeable and hence the page does not leak.
411 - Remove unused function fs/ntfs/runlist.c::ntfs_rl_merge(). (Adrian
412 Bunk)
413 - Fix stupid bug in fs/ntfs/attrib.c::ntfs_attr_find() that resulted in
414 a NULL pointer dereference in the error code path when a corrupt
415 attribute was found. (Thanks to Domen Puncer for the bug report.)
416 - Add MODULE_VERSION() to fs/ntfs/super.c.
417 - Make several functions and variables static. (Adrian Bunk)
418 - Modify fs/ntfs/aops.c::mark_ntfs_record_dirty() so it allocates
419 buffers for the page if they are not present and then marks the
420 buffers belonging to the ntfs record dirty. This causes the buffers
421 to become busy and hence they are safe from removal until the page
422 has been written out.
423 - Fix stupid bug in fs/ntfs/attrib.c::ntfs_external_attr_find() in the
424 error handling code path that resulted in a BUG() due to trying to
425 unmap an extent mft record when the mapping of it had failed and it
426 thus was not mapped. (Thanks to Ken MacFerrin for the bug report.)
427 - Drop the runlist lock after the vcn has been read in
428 fs/ntfs/lcnalloc.c::__ntfs_cluster_free().
429 - Rewrite handling of multi sector transfer errors. We now do not set
430 PageError() when such errors are detected in the async i/o handler
431 fs/ntfs/aops.c::ntfs_end_buffer_async_read(). All users of mst
432 protected attributes now check the magic of each ntfs record as they
433 use it and act appropriately. This has the effect of making errors
434 granular per ntfs record rather than per page which solves the case
435 where we cannot access any of the ntfs records in a page when a
436 single one of them had an mst error. (Thanks to Ken MacFerrin for
437 the bug report.)
438 - Fix error handling in fs/ntfs/quota.c::ntfs_mark_quotas_out_of_date()
439 where we failed to release i_mutex on the $Quota/$Q attribute inode.
440 - Fix bug in handling of bad inodes in fs/ntfs/namei.c::ntfs_lookup().
441 - Add mapping of unmapped buffers to all remaining code paths, i.e.
442 fs/ntfs/aops.c::ntfs_write_mst_block(), mft.c::ntfs_sync_mft_mirror(),
443 and write_mft_record_nolock(). From now on we require that the
444 complete runlist for the mft mirror is always mapped into memory.
445 - Add creation of buffers to fs/ntfs/mft.c::ntfs_sync_mft_mirror().
446 - Improve error handling in fs/ntfs/aops.c::ntfs_{read,write}_block().
447 - Cleanup fs/ntfs/aops.c::ntfs_{read,write}page() since we know that a
448 resident attribute will be smaller than a page which makes the code
449 simpler. Also make the code more tolerant to concurrent ->truncate.
450
4512.1.21 - Fix some races and bugs, rewrite mft write code, add mft allocator.
452
453 - Implement extent mft record deallocation
454 fs/ntfs/mft.c::ntfs_extent_mft_record_free().
455 - Split runlist-related functions off from attrib.[hc] to runlist.[hc].
456 - Add vol->mft_data_pos and initialize it at mount time.
457 - Rename init_runlist() to ntfs_init_runlist(), ntfs_vcn_to_lcn() to
458 ntfs_rl_vcn_to_lcn(), decompress_mapping_pairs() to
459 ntfs_mapping_pairs_decompress(), ntfs_merge_runlists() to
460 ntfs_runlists_merge() and adapt all callers.
461 - Add fs/ntfs/runlist.[hc]::ntfs_get_nr_significant_bytes(),
462 ntfs_get_size_for_mapping_pairs(), ntfs_write_significant_bytes(),
463 and ntfs_mapping_pairs_build(), adapted from libntfs.
464 - Make fs/ntfs/lcnalloc.c::ntfs_cluster_free_from_rl_nolock() not
465 static and add a declaration for it to lcnalloc.h.
466 - Add fs/ntfs/lcnalloc.h::ntfs_cluster_free_from_rl() which is a static
467 inline wrapper for ntfs_cluster_free_from_rl_nolock() which takes the
468 cluster bitmap lock for the duration of the call.
469 - Add fs/ntfs/attrib.[hc]::ntfs_attr_record_resize().
470 - Implement the equivalent of memset() for an ntfs attribute in
471 fs/ntfs/attrib.[hc]::ntfs_attr_set() and switch
472 fs/ntfs/logfile.c::ntfs_empty_logfile() to using it.
473 - Remove unnecessary casts from LCN_* constants.
474 - Implement fs/ntfs/runlist.c::ntfs_rl_truncate_nolock().
475 - Add MFT_RECORD_OLD as a copy of MFT_RECORD in fs/ntfs/layout.h and
476 change MFT_RECORD to contain the NTFS 3.1+ specific fields.
477 - Add a helper function fs/ntfs/aops.c::mark_ntfs_record_dirty() which
478 marks all buffers belonging to an ntfs record dirty, followed by
479 marking the page the ntfs record is in dirty and also marking the vfs
480 inode containing the ntfs record dirty (I_DIRTY_PAGES).
481 - Switch fs/ntfs/index.h::ntfs_index_entry_mark_dirty() to using the
482 new helper fs/ntfs/aops.c::mark_ntfs_record_dirty() and remove the no
483 longer needed fs/ntfs/index.[hc]::__ntfs_index_entry_mark_dirty().
484 - Move ntfs_{un,}map_page() from ntfs.h to aops.h and fix resulting
485 include errors.
486 - Move the typedefs for runlist_element and runlist from types.h to
487 runlist.h and fix resulting include errors.
488 - Remove unused {__,}format_mft_record() from fs/ntfs/mft.c.
489 - Modify fs/ntfs/mft.c::__mark_mft_record_dirty() to use the helper
490 mark_ntfs_record_dirty() which also changes the behaviour in that we
491 now set the buffers belonging to the mft record dirty as well as the
492 page itself.
493 - Update fs/ntfs/mft.c::write_mft_record_nolock() and sync_mft_mirror()
494 to cope with the fact that there now are dirty buffers in mft pages.
495 - Update fs/ntfs/inode.c::ntfs_write_inode() to also use the helper
496 mark_ntfs_record_dirty() and thus to set the buffers belonging to the
497 mft record dirty as well as the page itself.
498 - Fix compiler warnings on x86-64 in fs/ntfs/dir.c. (Randy Dunlap,
499 slightly modified by me)
500 - Add fs/ntfs/mft.c::try_map_mft_record() which fails with -EALREADY if
501 the mft record is already locked and otherwise behaves the same way
502 as fs/ntfs/mft.c::map_mft_record().
503 - Modify fs/ntfs/mft.c::write_mft_record_nolock() so that it only
504 writes the mft record if the buffers belonging to it are dirty.
505 Otherwise we assume that it was written out by other means already.
506 - Attempting to write outside initialized size is _not_ a bug so remove
507 the bug check from fs/ntfs/aops.c::ntfs_write_mst_block(). It is in
508 fact required to write outside initialized size when preparing to
509 extend the initialized size.
510 - Map the page instead of using page_address() before writing to it in
511 fs/ntfs/aops.c::ntfs_mft_writepage().
512 - Provide exclusion between opening an inode / mapping an mft record
513 and accessing the mft record in fs/ntfs/mft.c::ntfs_mft_writepage()
514 by setting the page not uptodate throughout ntfs_mft_writepage().
515 - Clear the page uptodate flag in fs/ntfs/aops.c::ntfs_write_mst_block()
516 to ensure no one can see the page whilst the mst fixups are applied.
517 - Add the helper fs/ntfs/mft.c::ntfs_may_write_mft_record() which
518 checks if an mft record may be written out safely obtaining any
519 necessary locks in the process. This is used by
520 fs/ntfs/aops.c::ntfs_write_mst_block().
521 - Modify fs/ntfs/aops.c::ntfs_write_mst_block() to also work for
522 writing mft records and improve its error handling in the process.
523 Now if any of the records in the page fail to be written out, all
524 other records will be written out instead of aborting completely.
525 - Remove ntfs_mft_aops and update all users to use ntfs_mst_aops.
526 - Modify fs/ntfs/inode.c::ntfs_read_locked_inode() to set the
527 ntfs_mst_aops for all inodes which are NInoMstProtected() and
528 ntfs_aops for all other inodes.
529 - Rename fs/ntfs/mft.c::sync_mft_mirror{,_umount}() to
530 ntfs_sync_mft_mirror{,_umount}() and change their parameters so they
531 no longer require an ntfs inode to be present. Update all callers.
532 - Cleanup the error handling in fs/ntfs/mft.c::ntfs_sync_mft_mirror().
533 - Clear the page uptodate flag in fs/ntfs/mft.c::ntfs_sync_mft_mirror()
534 to ensure no one can see the page whilst the mst fixups are applied.
535 - Remove the no longer needed fs/ntfs/mft.c::ntfs_mft_writepage() and
536 fs/ntfs/mft.c::try_map_mft_record().
537 - Fix callers of fs/ntfs/aops.c::mark_ntfs_record_dirty() to call it
538 with the ntfs inode which contains the page rather than the ntfs
539 inode the mft record of which is in the page.
540 - Fix race condition in fs/ntfs/inode.c::ntfs_put_inode() by moving the
541 index inode bitmap inode release code from there to
542 fs/ntfs/inode.c::ntfs_clear_big_inode(). (Thanks to Christoph
543 Hellwig for spotting this.)
544 - Fix race condition in fs/ntfs/inode.c::ntfs_put_inode() by taking the
545 inode semaphore around the code that sets ni->itype.index.bmp_ino to
546 NULL and reorganize the code to optimize it a bit. (Thanks to
547 Christoph Hellwig for spotting this.)
548 - Modify fs/ntfs/aops.c::mark_ntfs_record_dirty() to no longer take the
549 ntfs inode as a parameter as this is confusing and misleading and the
550 needed ntfs inode is available via NTFS_I(page->mapping->host).
551 Adapt all callers to this change.
552 - Modify fs/ntfs/mft.c::write_mft_record_nolock() and
553 fs/ntfs/aops.c::ntfs_write_mst_block() to only check the dirty state
554 of the first buffer in a record and to take this as the ntfs record
555 dirty state. We cannot look at the dirty state for subsequent
556 buffers because we might be racing with
557 fs/ntfs/aops.c::mark_ntfs_record_dirty().
558 - Move the static inline ntfs_init_big_inode() from fs/ntfs/inode.c to
559 inode.h and make fs/ntfs/inode.c::__ntfs_init_inode() non-static and
560 add a declaration for it to inode.h. Fix some compilation issues
561 that resulted due to #includes and header file interdependencies.
562 - Simplify setup of i_mode in fs/ntfs/inode.c::ntfs_read_locked_inode().
563 - Add helpers fs/ntfs/layout.h::MK_MREF() and MK_LE_MREF().
564 - Modify fs/ntfs/mft.c::map_extent_mft_record() to only verify the mft
565 record sequence number if it is specified (i.e. not zero).
566 - Add fs/ntfs/mft.[hc]::ntfs_mft_record_alloc() and various helper
567 functions used by it.
568 - Update Documentation/filesystems/ntfs.txt with instructions on how to
569 use the Device-Mapper driver with NTFS ftdisk/LDM raid. This removes
570 the linear raid problem with the Software RAID / MD driver when one
571 or more of the devices has an odd number of sectors.
572
5732.1.20 - Fix two stupid bugs introduced in 2.1.18 release.
574
575 - Fix stupid bug in fs/ntfs/attrib.c::ntfs_attr_reinit_search_ctx()
576 where we did not clear ctx->al_entry but it was still set due to
577 changes in ntfs_attr_lookup() and ntfs_external_attr_find() in
578 particular.
579 - Fix another stupid bug in fs/ntfs/attrib.c::ntfs_external_attr_find()
580 where we forgot to unmap the extent mft record when we had finished
581 enumerating an attribute which caused a bug check to trigger when the
582 VFS calls ->clear_inode.
583
5842.1.19 - Many cleanups, improvements, and a minor bug fix.
585
586 - Update ->setattr (fs/ntfs/inode.c::ntfs_setattr()) to refuse to
587 change the uid, gid, and mode of an inode as we do not support NTFS
588 ACLs yet.
589 - Remove BKL use from ntfs_setattr() syncing up with the rest of the
590 kernel.
591 - Get rid of the ugly transparent union in fs/ntfs/dir.c::ntfs_readdir()
592 and ntfs_filldir() as per suggestion from Al Viro.
593 - Change '\0' and L'\0' to simply 0 as per advice from Linus Torvalds.
594 - Update ->truncate (fs/ntfs/inode.c::ntfs_truncate()) to check if the
595 inode size has changed and to only output an error if so.
596 - Rename fs/ntfs/attrib.h::attribute_value_length() to ntfs_attr_size().
597 - Add le{16,32,64} as well as sle{16,32,64} data types to
598 fs/ntfs/types.h.
599 - Change ntfschar to be le16 instead of u16 in fs/ntfs/types.h.
600 - Add le versions of VCN, LCN, and LSN called leVCN, leLCN, and leLSN,
601 respectively, to fs/ntfs/types.h.
602 - Update endianness conversion macros in fs/ntfs/endian.h to use the
603 new types as appropriate.
604 - Do proper type casting when using sle64_to_cpup() in fs/ntfs/dir.c
605 and index.c.
606 - Add leMFT_REF data type to fs/ntfs/layout.h.
607 - Update all NTFS header files with the new little endian data types.
608 Affected files are fs/ntfs/layout.h, logfile.h, and time.h.
609 - Do proper type casting when using ntfs_is_*_recordp() in
610 fs/ntfs/logfile.c, mft.c, and super.c.
611 - Fix all the sparse bitwise warnings. Had to change all the typedef
612 enums storing little endian values to simple enums plus a typedef for
613 the datatype to make sparse happy.
614 - Fix a bug found by the new sparse bitwise warnings where the default
615 upcase table was defined as a pointer to wchar_t rather than ntfschar
616 in fs/ntfs/ntfs.h and super.c.
617 - Change {const_,}cpu_to_le{16,32}(0) to just 0 as suggested by Al Viro.
618
6192.1.18 - Fix scheduling latencies at mount time as well as an endianness bug.
620
621 - Remove vol->nr_mft_records as it was pretty meaningless and optimize
622 the calculation of total/free inodes as used by statfs().
623 - Fix scheduling latencies in ntfs_fill_super() by dropping the BKL
624 because the code itself is using the ntfs_lock semaphore which
625 provides safe locking. (Ingo Molnar)
626 - Fix a potential bug in fs/ntfs/mft.c::map_extent_mft_record() that
627 could occur in the future for when we start closing/freeing extent
628 inodes if we don't set base_ni->ext.extent_ntfs_inos to NULL after
629 we free it.
630 - Rename {find,lookup}_attr() to ntfs_attr_{find,lookup}() as well as
631 find_external_attr() to ntfs_external_attr_find() to cleanup the
632 namespace a bit and to be more consistent with libntfs.
633 - Rename {{re,}init,get,put}_attr_search_ctx() to
634 ntfs_attr_{{re,}init,get,put}_search_ctx() as well as the type
635 attr_search_context to ntfs_attr_search_ctx.
636 - Force use of ntfs_attr_find() in ntfs_attr_lookup() when searching
637 for the attribute list attribute itself.
638 - Fix endianness bug in ntfs_external_attr_find().
639 - Change ntfs_{external_,}attr_find() to return 0 on success, -ENOENT
640 if the attribute is not found, and -EIO on real error. In the case
641 of -ENOENT, the search context is updated to describe the attribute
642 before which the attribute being searched for would need to be
643 inserted if such an action were to be desired and in the case of
644 ntfs_external_attr_find() the search context is also updated to
645 indicate the attribute list entry before which the attribute list
646 entry of the attribute being searched for would need to be inserted
647 if such an action were to be desired. Also make ntfs_find_attr()
648 static and remove its prototype from attrib.h as it is not used
649 anywhere other than attrib.c. Update ntfs_attr_lookup() and all
650 callers of ntfs_{external,}attr_{find,lookup}() for the new return
651 values.
652 - Minor cleanup of fs/ntfs/inode.c::ntfs_init_locked_inode().
653
6542.1.17 - Fix bugs in mount time error code paths and other updates.
655
656 - Implement bitmap modification code (fs/ntfs/bitmap.[hc]). This
657 includes functions to set/clear a single bit or a run of bits.
658 - Add fs/ntfs/attrib.[hc]::ntfs_find_vcn() which returns the locked
659 runlist element containing a particular vcn. It also takes care of
660 mapping any needed runlist fragments.
661 - Implement cluster (de-)allocation code (fs/ntfs/lcnalloc.[hc]).
662 - Load attribute definition table from $AttrDef at mount time.
663 - Fix bugs in mount time error code paths involving (de)allocation of
664 the default and volume upcase tables.
665 - Remove ntfs_nr_mounts as it is no longer used.
666
6672.1.16 - Implement access time updates, file sync, async io, and read/writev.
668
669 - Add support for readv/writev and aio_read/aio_write (fs/ntfs/file.c).
670 This is done by setting the appropriate file operations pointers to
671 the generic helper functions provided by mm/filemap.c.
672 - Implement fsync, fdatasync, and msync both for files (fs/ntfs/file.c)
673 and directories (fs/ntfs/dir.c).
674 - Add support for {a,m,c}time updates to inode.c::ntfs_write_inode().
675 Note, except for the root directory and any other system files opened
676 by the user, the system files will not have their access times
677 updated as they are only accessed at the inode level and hence the
678 file level functions which cause the times to be updated are never
679 invoked.
680
6812.1.15 - Invalidate quotas when (re)mounting read-write.
682
683 - Add new element itype.index.collation_rule to the ntfs inode
684 structure and set it appropriately in ntfs_read_locked_inode().
685 - Implement a new inode type "index" to allow efficient access to the
686 indices found in various system files and adapt inode handling
687 accordingly (fs/ntfs/inode.[hc]). An index inode is essentially an
688 attribute inode (NInoAttr() is true) with an attribute type of
689 AT_INDEX_ALLOCATION. As such, it is no longer allowed to call
690 ntfs_attr_iget() with an attribute type of AT_INDEX_ALLOCATION as
691 there would be no way to distinguish between normal attribute inodes
692 and index inodes. The function to obtain an index inode is
693 ntfs_index_iget() and it uses the helper function
694 ntfs_read_locked_index_inode(). Note, we do not overload
695 ntfs_attr_iget() as indices consist of multiple attributes so using
696 ntfs_attr_iget() to obtain an index inode would be confusing.
697 - Ensure that there is no overflow when doing page->index <<
698 PAGE_CACHE_SHIFT by casting page->index to s64 in fs/ntfs/aops.c.
699 - Use atomic kmap instead of kmap() in fs/ntfs/aops.c::ntfs_read_page()
700 and ntfs_read_block().
701 - Use case sensitive attribute lookups instead of case insensitive ones.
702 - Lock all page cache pages belonging to mst protected attributes while
703 accessing them to ensure we never see corrupt data while the page is
704 under writeout.
705 - Add framework for generic ntfs collation (fs/ntfs/collation.[hc]).
706 We have ntfs_is_collation_rule_supported() to check if the collation
707 rule you want to use is supported and ntfs_collation() which actually
708 collates two data items. We currently only support COLLATION_BINARY
709 and COLLATION_NTOFS_ULONG but support for other collation rules will
710 be added as the need arises.
711 - Add a new type, ntfs_index_context, to allow retrieval of an index
712 entry using the corresponding index key. To get an index context,
713 use ntfs_index_ctx_get() and to release it, use ntfs_index_ctx_put().
714 This also adds a new slab cache for the index contexts. To lookup a
715 key in an index inode, use ntfs_index_lookup(). After modifying an
716 index entry, call ntfs_index_entry_flush_dcache_page() followed by
717 ntfs_index_entry_mark_dirty() to ensure the changes are written out
718 to disk. For details see fs/ntfs/index.[hc]. Note, at present, if
719 an index entry is in the index allocation attribute rather than the
720 index root attribute it will not be written out (you will get a
721 warning message about discarded changes instead). A usage sketch of this API follows below, after this list.
722 - Load the quota file ($Quota) and check if quota tracking is enabled
723 and if so, mark the quotas out of date. This causes Windows to
724 rescan the volume on boot and update all quota entries.
725 - Add a set_page_dirty address space operation for ntfs_m[fs]t_aops.
726 It is simply set to __set_page_dirty_nobuffers() to make sure that
727 running set_page_dirty() on a page containing mft/ntfs records will
728 not affect the dirty state of the page buffers.
729 - Add fs/ntfs/index.c::__ntfs_index_entry_mark_dirty() which sets all
730 buffers that are inside the ntfs record in the page dirty after which
731 it sets the page dirty. This allows ->writepage to only write the
732 dirty index records rather than having to write all the records in
733 the page. Modify fs/ntfs/index.h::ntfs_index_entry_mark_dirty() to
734 use this rather than __set_page_dirty_nobuffers().
735 - Implement fs/ntfs/aops.c::ntfs_write_mst_block() which enables the
736 writing of page cache pages belonging to mst protected attributes
737 like the index allocation attribute in directory indices and other
738 indices like $Quota/$Q, etc. This means that the quota is now marked
739 out of date on all volumes rather than only on ones where the quota
740 defaults entry is in the index root attribute of the $Quota/$Q index.
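
        The intended call sequence for the index context API described
        above, as a sketch (the real prototypes are in fs/ntfs/index.[hc];
        the parameter lists and the key variable here are assumptions):

                ntfs_index_context *ictx;
                int err;

                ictx = ntfs_index_ctx_get(idx_ni);
                if (!ictx)
                        return -ENOMEM;
                err = ntfs_index_lookup(&key, sizeof(key), ictx);
                if (!err) {
                        /* ... modify the found index entry in place ... */
                        ntfs_index_entry_flush_dcache_page(ictx);
                        ntfs_index_entry_mark_dirty(ictx);
                }
                ntfs_index_ctx_put(ictx);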
741
7422.1.14 - Fix an NFSd caused deadlock reported by several users.
743
744 - Modify fs/ntfs/dir.c::ntfs_readdir() to copy the index root attribute
745 value to a buffer so that we can put the search context and unmap the
746 mft record before calling the filldir() callback. We need to do this
747 because of NFSd, which calls ->lookup() from its filldir() callback,
748 and this causes NTFS to deadlock: ntfs_lookup() maps the mft record
749 of the directory, but ntfs_readdir() already has it mapped, so
750 ntfs_lookup() deadlocks.
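
        Schematically, the fix reorders things as follows (a sketch only;
        ir_copy, ctx and ndir are stand-ins, and the filldir() argument
        list is an era-specific approximation):

                memcpy(ir_copy, ir, ir_len);    /* copy index root value  */
                ntfs_attr_put_search_ctx(ctx);  /* put the search context */
                unmap_mft_record(ndir);         /* unmap the mft record   */
                /*
                 * Only now is it safe to call filldir(), which under NFSd
                 * may re-enter the filesystem via ->lookup().
                 */
                err = filldir(dirent, name, name_len, fpos, mref, dt_type);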
751
7522.1.13 - Enable overwriting of resident files and housekeeping of system files.
753
754 - Implement writing of mft records (fs/ntfs/mft.[hc]), which includes
755 keeping the mft mirror in sync with the mft when mirrored mft records
756 are written. The functions are write_mft_record{,_nolock}(). The
757 implementation is quite rudimentary for now with lots of things not
758 implemented yet but I am not sure any of them can actually occur so
759 I will wait for people to hit each one and only then implement it.
760 - Commit open system inodes at umount time. This should make it
761 virtually impossible for sync_mft_mirror_umount() to ever be needed.
762 - Implement ->write_inode (fs/ntfs/inode.c::ntfs_write_inode()) for the
763 ntfs super operations. This gives us inode writing via the VFS inode
764 dirty code paths. Note: Access time updates are not implemented yet.
765 - Implement fs/ntfs/mft.[hc]::{,__}mark_mft_record_dirty() and make
766 fs/ntfs/aops.c::ntfs_writepage() and ntfs_commit_write() use it, thus
767 finally enabling resident file overwrite! (-8 This also includes a
768 placeholder for ->writepage (ntfs_mft_writepage()), which for now
769 just redirties the page and returns. Also, at umount time, we for
770 now throw away all mft data page cache pages after the last call to
771 ntfs_commit_inode() in the hope that all inodes will have been
772 written out by then and hence no dirty (meta)data will be lost. We
773 also check for this case and emit an error message telling the user
774 to run chkdsk.
775 - Use set_page_writeback() and end_page_writeback() in the resident
776 attribute code path of fs/ntfs/aops.c::ntfs_writepage() otherwise
777 the radix-tree tag PAGECACHE_TAG_DIRTY remains set even though the
778 page is clean.
779 - Implement ntfs_mft_writepage() so it now checks if any of the mft
780 records in the page are dirty and if so redirties the page and
781 returns. Otherwise it just returns (after doing set_page_writeback(),
782 unlock_page(), end_page_writeback(), without which the radix-tree tag
783 PAGECACHE_TAG_DIRTY would remain set even though the page is clean),
784 thus allowing the VM to do with the page as it pleases. Also, at umount
785 time, now only throw away dirty mft (meta)data pages if dirty inodes
786 are present and ask the user to email us if they see this happening.
787 - Add functions ntfs_{clear,set}_volume_flags(), to modify the volume
788 information flags (fs/ntfs/super.c).
789 - Mark the volume dirty when (re)mounting read-write and mark it clean
790 when unmounting or remounting read-only. If any volume errors are
791 found, the volume is left marked dirty to force chkdsk to run.
792 - Add code to set the NT4 compatibility flag when (re)mounting
793 read-write for newer NTFS versions but leave it commented out for now
794 since we do not make any modifications that are NTFS 1.2 specific yet
795 and since setting this flag breaks Captive-NTFS which is not nice.
796 This code must be enabled once we start writing NTFS 1.2 specific
797 changes, otherwise the Windows NTFS driver might crash / cause corruption.
798
7992.1.12 - Fix the second fix to the decompression engine and some cleanups.
800
801 - Add a new address space operations struct, ntfs_mst_aops, for mst
802 protected attributes. This is because the default ntfs_aops do not
803 make sense with mst protected data and were they to write anything to
804 such an attribute they would cause data corruption so we provide
805 ntfs_mst_aops which does not have any write related operations set.
806 - Cleanup dirty ntfs inode handling (fs/ntfs/inode.[hc]) which also
807 includes an adapted ntfs_commit_inode() and an implementation of
808 ntfs_write_inode() which for now just cleans dirty inodes without
809 writing them (it does emit a warning that this is happening).
810 - Undo the second decompression engine fix (see 2.1.9 release ChangeLog
811 entry) as it was only fixing a theoretical bug but at the same time
812 it badly broke the handling of sparse and uncompressed compression
813 blocks.
814
8152.1.11 - Driver internal cleanups.
816
817 - Only build logfile.o if building the driver with read-write support.
818 - Really final white space cleanups.
819 - Use generic_ffs() instead of ffs() in logfile.c which allows the
820 log_page_size variable to be optimized by gcc into a constant.
821 - Rename uchar_t to ntfschar everywhere as uchar_t is unsigned 1-byte
822 char as defined by POSIX and as found on some systems.
823
8242.1.10 - Force read-only (re)mounting of volumes with unsupported volume flags.
825
826 - Finish off the white space cleanups (remove trailing spaces, etc).
827 - Clean up ntfs_fill_super() and ntfs_read_inode_mount() by removing
828 the kludges around the first iget(). Instead of (re)setting ->s_op
829 we have the $MFT inode set up by explicit new_inode() / set ->i_ino /
830 insert_inode_hash() / call ntfs_read_inode_mount() directly. This
831 kills the need for a second super_operations structure and allows
832 returning an error from ntfs_read_inode_mount() without resorting to
833 ugly "poisoning" tricks. (Al Viro)
834 - Force read-only (re)mounting if any of the following bits are set in
835 the volume information flags:
836 VOLUME_IS_DIRTY, VOLUME_RESIZE_LOG_FILE,
837 VOLUME_UPGRADE_ON_MOUNT, VOLUME_DELETE_USN_UNDERWAY,
838 VOLUME_REPAIR_OBJECT_ID, VOLUME_MODIFIED_BY_CHKDSK
839 To make this easier we define VOLUME_MUST_MOUNT_RO_MASK with all the
840 above bits set so the test is made easy.
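
        Schematically (a sketch; the real definitions live in the driver's
        headers, the individual flag values are omitted here, and
        vol->vol_flags is an assumed field name):

                #define VOLUME_MUST_MOUNT_RO_MASK (VOLUME_IS_DIRTY            | \
                                                   VOLUME_RESIZE_LOG_FILE     | \
                                                   VOLUME_UPGRADE_ON_MOUNT    | \
                                                   VOLUME_DELETE_USN_UNDERWAY | \
                                                   VOLUME_REPAIR_OBJECT_ID    | \
                                                   VOLUME_MODIFIED_BY_CHKDSK)

                if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
                        /* Refuse read-write: force a read-only mount. */
                        sb->s_flags |= MS_RDONLY;
                }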
841
8422.1.9 - Fix two bugs in decompression engine.
843
844 - Fix a bug where we would not always detect that we have reached the
845 end of a compression block because we were ending at minus one byte
846 which is effectively the same as being at the end. The fix is to
847 check whether the uncompressed buffer has been fully filled and if so
848 we assume we have reached the end of the compression block. A big
849 thank you to Marcin Gibuła for the bug report, the assistance in
850 tracking down the bug and testing the fix.
851 - Fix a possible bug where when a compressed read is truncated to the
852 end of the file, the offset inside the last page was not truncated.
853
8542.1.8 - Handle $MFT mirror and $LogFile, improve time handling, and cleanups.
855
856 - Use get_bh() instead of manual atomic_inc() in fs/ntfs/compress.c.
857 - Modify fs/ntfs/time.c::ntfs2utc(), get_current_ntfs_time(), and
858 utc2ntfs() to work with struct timespec instead of time_t on the
859 Linux UTC time side thus preserving the full precision of the NTFS
860 time and only losing up to 99 nanoseconds in the Linux UTC time. (A conversion sketch follows below, after this list.)
861 - Move fs/ntfs/time.c to fs/ntfs/time.h and make the time functions
862 static inline.
863 - Remove unused ntfs_dirty_inode().
864 - Cleanup super operations declaration in fs/ntfs/super.c.
865 - Wrap flush_dcache_mft_record_page() in #ifdef NTFS_RW.
866 - Add NInoTestSetFoo() and NInoTestClearFoo() macro magic to
867 fs/ntfs/inode.h and use it to declare NInoTest{Set,Clear}Dirty.
868 - Move typedefs for ntfs_attr and test_t from fs/ntfs/inode.c to
869 fs/ntfs/inode.h so they can be used elsewhere.
870 - Determine the mft mirror size as the number of mirrored mft records
871 and store it in ntfs_volume->mftmirr_size (fs/ntfs/super.c).
872 - Load the mft mirror at mount time and compare the mft records stored
873 in it to the ones in the mft. Force a read-only mount if the two do
874 not match (fs/ntfs/super.c).
875 - Fix type casting related warnings on 64-bit architectures. Thanks
876 to Meelis Roos for reporting them.
877 - Move %L to %ll as %L is floating point and %ll is integer which is
878 what we want.
879 - Read the journal ($LogFile) and determine if the volume has been
880 shutdown cleanly and force a read-only mount if not (fs/ntfs/super.c
881 and fs/ntfs/logfile.c). This is a little bit of a crude check in
882 that we only look at the restart areas and not at the actual log
883 records so that there will be a very small number of cases where we
884 think that a volume is dirty when in fact it is clean. This should
885 only affect volumes that have not been shutdown cleanly and did not
886 have any pending, non-check-pointed i/o.
887 - If the $LogFile indicates a clean shutdown and a read-write (re)mount
888 is requested, empty $LogFile by overwriting it with 0xff bytes to
889 ensure that Windows cannot cause data corruption by replaying a stale
890 journal after Linux has written to the volume.
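
        For reference, NTFS stores times as 100-nanosecond intervals since
        1601-01-01, so a conversion in the spirit of ntfs2utc() looks
        roughly like this (a self-contained sketch, not the driver code;
        11644473600 is the well-known 1601-to-1970 offset in seconds):

                #include <stdint.h>
                #include <time.h>

                #define NTFS_TIME_OFFSET 11644473600LL  /* 1601 -> 1970 */

                static struct timespec toy_ntfs2utc(int64_t ntfs_time)
                {
                        struct timespec ts;

                        ts.tv_sec  = ntfs_time / 10000000 - NTFS_TIME_OFFSET;
                        ts.tv_nsec = (ntfs_time % 10000000) * 100;
                        return ts;
                }

        Going the other way, utc2ntfs() necessarily rounds tv_nsec to a
        multiple of 100ns, which is where the up-to-99ns loss mentioned
        above comes from.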
891
8922.1.7 - Enable NFS exporting of mounted NTFS volumes.
893
894 - Set i_generation in the VFS inode from the seq_no of the NTFS inode.
895 - Make ntfs_lookup() NFS export safe, i.e. use d_splice_alias(), etc.
896 - Implement ->get_dentry() in fs/ntfs/namei.c::ntfs_get_dentry() as the
897 default doesn't allow inode number 0, which is a valid inode on NTFS,
898 and even if it did allow that, it uses iget() instead of ntfs_iget(),
899 which makes it useless for us.
900 - Implement ->get_parent() in fs/ntfs/namei.c::ntfs_get_parent() as the
901 default just returns -EACCES which is not very useful.
902 - Define export operations (->s_export_op) for NTFS (ntfs_export_ops)
903 and set them up in the super block at mount time (super.c). This
904 allows mounted NTFS volumes to be exported via NFS.
905 - Add missing return -EOPNOTSUPP; in
906 fs/ntfs/aops.c::ntfs_commit_nonresident_write().
907 - Enforce no atime and no dir atime updates at mount/remount time as
908 they are not implemented yet anyway.
909 - Move a few assignments in fs/ntfs/attrib.c::load_attribute_list() to
910 after a NULL check. Thanks to Dave Jones for pointing this out.
911
9122.1.6 - Fix minor bug in handling of compressed directories.
913
914 - Fix bug in handling of compressed directories. A compressed
915 directory is not really compressed so when we set the ->i_blocks
916 field of a compressed directory inode we were setting it from the
917 non-existing field ni->itype.compressed.size which gave random
918 results... For directories we now always use ni->allocated_size.
919
9202.1.5 - Fix minor bug in attribute list attribute handling.
921
922 - Fix bug in attribute list handling. It is not so much a bug as too
923 much protection: we were rejecting attribute lists which waste space
924 on disk, while Windows XP clearly allows them and in fact creates
925 such attribute lists, so our driver was failing.
926 - Update NTFS documentation ready for 2.6 kernel release.
927
9282.1.4 - Reduce compiler requirements.
929
930 - Remove all uses of unnamed structs and unions in the driver to make
931 old and newer gcc versions happy. Makes it a bit uglier IMO but at
932 least people will stop hassling me about it.
933
9342.1.3 - Important bug fixes in corner cases.
935
936 - super.c::parse_ntfs_boot_sector(): Correct the check for 64-bit
937 clusters. (Philipp Thomas)
938 - attrib.c::load_attribute_list(): Fix bug when initialized_size is a
939 multiple of the block_size but not the cluster size. (Szabolcs
940 Szakacsits)
941
9422.1.2 - Important bug fixes alleviating the hangs in statfs.
943
944 - Fix buggy free cluster and free inode determination logic.
945
9462.1.1 - Minor updates.
947
948 - Add handling for initialized_size != data_size in compressed files.
949 - Reduce function local stack usage from 0x3d4 bytes to just noise in
950 fs/ntfs/upcase.c. (Randy Dunlap)
951 - Remove compiler warnings for newer gcc.
952 - Pages are no longer kmapped by mm/filemap.c::generic_file_write()
953 around calls to ->{prepare,commit}_write. Adapt NTFS appropriately
954 in fs/ntfs/aops.c::ntfs_prepare_nonresident_write() by using
955 kmap_atomic(KM_USER0).
956
9572.1.0 - First steps towards write support: implement file overwrite.
958
959 - Add configuration option for developmental write support with an
960 appropriately scary configuration help text.
961 - Initial implementation of fs/ntfs/aops.c::ntfs_writepage() and its
962 helper fs/ntfs/aops.c::ntfs_write_block(). This enables mmap(2) based
963 overwriting of existing files on ntfs. Note: Resident files are
964 only written into memory, and not written out to disk at present, so
965 avoid writing to files smaller than about 1kiB.
966 - Initial implementation of fs/ntfs/aops.c::ntfs_prepare_write(), its
967 helper fs/ntfs/aops.c::ntfs_prepare_nonresident_write() and their
968 counterparts, fs/ntfs/aops.c::ntfs_commit_write(), and
969 fs/ntfs/aops.c::ntfs_commit_nonresident_write(), respectively. Also,
970 add generic_file_write() to the ntfs file operations (fs/ntfs/file.c).
971 This enables write(2) based overwriting of existing files on ntfs.
972 Note: As with mmap(2) based overwriting, resident files are only
973 written into memory, and not written out to disk at present, so avoid
974 writing to files smaller than about 1kiB.
975 - Implement ->truncate (fs/ntfs/inode.c::ntfs_truncate()) and
976 ->setattr() (fs/ntfs/inode.c::ntfs_setattr()) inode operations for
977 files with the purpose of intercepting and aborting all i_size
978 changes which we do not support yet. ntfs_truncate() actually only
979 emits a warning message but AFAICS our interception of i_size changes
980 elsewhere means ntfs_truncate() never gets called for i_size changes.
981 It is only called from generic_file_write() when we fail in
982 ntfs_prepare_{,nonresident_}write() in order to discard any
983 instantiated buffers beyond i_size. Thus i_size is not actually
984 changed so our warning message is enough. Unfortunately it is not
985 possible to easily determine if i_size is being changed or not hence
986 we just emit an appropriately worded error message.
987
9882.0.25 - Small bug fixes and cleanups.
989
990 - Unlock the page in an out of memory error code path in
991 fs/ntfs/aops.c::ntfs_read_block().
992 - If fs/ntfs/aops.c::ntfs_read_page() is called on an uptodate page,
993 just unlock the page and return. (This can happen due to ->writepage
994 clearing PageUptodate() during write out of MstProtected()
995 attributes.)
996 - Remove leaked write code again.
997
9982.0.24 - Cleanups.
999
1000 - Treat BUG_ON() as ASSERT() not VERIFY(), i.e. do not use side effects
1001 inside BUG_ON(). (Adam J. Richter)
1002 - Split logical OR expressions inside BUG_ON() into individual BUG_ON()
1003 calls for improved debugging. (Adam J. Richter) Both rules are sketched below, after this list.
1004 - Add errors flag to the ntfs volume state, accessed via
1005 NVol{,Set,Clear}Errors(vol).
1006 - Do not allow read-write remounts of read-only volumes with errors.
1007 - Clarify comment for ntfs file operation sendfile which was added by
1008 Christoph Hellwig a while ago (just using generic_file_sendfile())
1009 to say that ntfs ->sendfile is only used for the case where the
1010 source data is on the ntfs partition and the destination is
1011 somewhere else, i.e. nothing we need to concern ourselves with.
1012 - Add generic_file_write() as our ntfs file write operation.
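
        The two BUG_ON() rules above in miniature (do_something() is a
        hypothetical helper with a side effect):

                /* Bad: the side effect disappears if BUG_ON() is ever
                 * compiled out. */
                BUG_ON(do_something() < 0);

                /* Good: keep the side effect, assert on the result. */
                err = do_something();
                BUG_ON(err < 0);

                /* Bad: one test, no clue which condition fired. */
                BUG_ON(!a || !b);

                /* Good: individual assertions pinpoint the failure. */
                BUG_ON(!a);
                BUG_ON(!b);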
1013
10142.0.23 - Major bug fixes (races, deadlocks, non-i386 architectures).
1015
1016 - Massive internal locking changes to mft record locking. Fixes lock
1017 recursion and replaces the mrec_lock read/write semaphore with a
1018 mutex. Also removes the now superfluous mft_count. This fixes several
1019 race conditions and deadlocks, especially in the future write code.
1020 - Fix ntfs over loopback for compressed files by adding an
1021 optimization barrier. (gcc was screwing up otherwise ?)
1022 - Miscellaneous cleanups all over the code and a fix or two in error
1023 handling code paths.
1024 Thanks go to Christoph Hellwig for pointing out the following two:
1025 - Remove now unused function fs/ntfs/malloc.h::vmalloc_nofs().
1026 - Fix ntfs_free() for ia64 and parisc by checking for VMALLOC_END, too.
1027
10282.0.22 - Cleanups, mainly to ntfs_readdir(), and use C99 initializers.
1029
1030 - Change fs/ntfs/dir.c::ntfs_readdir() to only read/write ->f_pos once
1031 at entry/exit respectively.
1032 - Use C99 initializers for structures (sketched below, after this list).
1033 - Remove unused variable blocks from fs/ntfs/aops.c::ntfs_read_block().
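
        The initializer change in miniature (ntfs_dir_ops and the members
        shown are illustrative, not the driver's actual tables):

                #if 0   /* Old positional style: fragile whenever the
                           struct layout changes. */
                static struct file_operations ntfs_dir_ops = {
                        NULL,                   /* llseek  */
                        generic_read_dir,       /* read    */
                        ntfs_readdir,           /* readdir */
                };
                #endif

                /* C99 designated initializers: robust, self-documenting. */
                static struct file_operations ntfs_dir_ops = {
                        .read    = generic_read_dir,
                        .readdir = ntfs_readdir,
                };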
1034
10352.0.21 - Check for, and refuse to work with too large files/directories/volumes.
1036
1037 - Limit volume size at mount time to 2TiB on architectures where
1038 unsigned long is 32-bits (fs/ntfs/super.c::parse_ntfs_boot_sector()).
1039 This is the most we can do without overflowing the 32-bit limit of
1040 the block device size imposed on us by sb_bread() and sb_getblk()
1041 for the time being.
1042 - Limit file/directory size at open() time to 16TiB on architectures
1043 where unsigned long is 32-bits (fs/ntfs/file.c::ntfs_file_open() and
1044 fs/ntfs/dir.c::ntfs_dir_open()). This is the most we can do without
1045 overflowing the page cache page index.
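
        The arithmetic behind the two limits, sketched (assuming 512-byte
        blocks and 4KiB pages; these are not the literal driver checks):

                /* sb_bread()/sb_getblk() take the block number as an
                 * unsigned long, so on 32-bit the device may have at most
                 * 2^32 blocks: 2^32 * 2^9 bytes = 2TiB. */
                if (sizeof(unsigned long) == 4 && volume_size > 2ULL << 40)
                        return -EFBIG;  /* refuse the mount (sketch) */

                /* A page cache index is also an unsigned long, so with
                 * 4KiB pages a file may span at most 2^32 pages:
                 * 2^32 * 2^12 bytes = 16TiB. */
                if (sizeof(unsigned long) == 4 && i_size > 16ULL << 40)
                        return -EFBIG;  /* refuse the open (sketch) */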
1046
10472.0.20 - Support non-resident directory index bitmaps, fix page leak in readdir.
1048
1049 - Move the directory index bitmap to use an attribute inode instead of
1050 having special fields for it inside the ntfs inode structure. This
1051 means that the index bitmaps now use the page cache for i/o, too,
1052 and also as a side effect we get support for non-resident index
1053 bitmaps for free.
1054 - Simplify/cleanup error handling in fs/ntfs/dir.c::ntfs_readdir() and
1055 fix a page leak that manifested itself in some cases.
1056 - Add fs/ntfs/inode.c::ntfs_put_inode(), which we need to release the
1057 index bitmap inode on the final iput().
1058
10592.0.19 - Fix race condition, improvements, and optimizations in i/o interface.
1060
1061 - Apply block optimization added to fs/ntfs/aops.c::ntfs_read_block()
1062 to fs/ntfs/compress.c::ntfs_file_read_compressed_block() as well.
1063 - Drop the "file" from ntfs_file_read_compressed_block().
1064 - Rename fs/ntfs/aops.c::ntfs_enb_buffer_read_async() to
1065 ntfs_end_buffer_async_read() (more like the fs/buffer.c counterpart).
1066 - Update ntfs_end_buffer_async_read() with the improved logic from
1067 its updated counterpart fs/buffer.c::end_buffer_async_read(). Apply
1068 further logic improvements to better determine when we set PageError.
1069 - Update submission of buffers in fs/ntfs/aops.c::ntfs_read_block() to
1070 check for the buffers being uptodate first in line with the updated
1071 fs/buffer.c::block_read_full_page(). This plugs a small race
1072 condition.
1073
10742.0.18 - Fix race condition in reading of compressed files.
1075
1076 - There was a narrow window between checking a buffer head for being
1077 uptodate and locking it in ntfs_file_read_compressed_block(). We now
1078 lock the buffer and then check whether it is uptodate or not.
1079
10802.0.17 - Cleanups and optimizations - shrinking the ToDo list.
1081
1082 - Modify fs/ntfs/inode.c::ntfs_read_locked_inode() to return an error
1083 code and update callers, i.e. ntfs_iget(), to pass that error code
1084 up instead of just using -EIO.
1085 - Modifications to super.c to ensure that both mount and remount
1086 cannot set any write related options when the driver is compiled
1087 read-only.
1088 - Optimize block resolution in fs/ntfs/aops.c::ntfs_read_block() to
1089 cache the current runlist element. This should improve performance
1090 when reading very large and/or very fragmented data.
1091
10922.0.16 - Convert access to $MFT/$BITMAP to attribute inode API.
1093
1094 - Fix a stupid bug introduced in 2.0.15 where we were unmapping the
1095 wrong inode in fs/ntfs/inode.c::ntfs_attr_iget().
1096 - Fix debugging check in fs/ntfs/aops.c::ntfs_read_block().
1097 - Convert $MFT/$BITMAP access to attribute inode API and remove all
1098 remnants of the ugly mftbmp address space and operations hack. This
1099 means we finally have only one readpage function as well as only one
1100 async io completion handler. Yey! The mft bitmap is now just an
1101 attribute inode and is accessed from vol->mftbmp_ino just as if it
1102 were a normal file. Fake inodes rule. (-:
1103
11042.0.15 - Fake inodes based attribute i/o via the pagecache, fixes and cleanups.
1105
1106 - Fix silly bug in fs/ntfs/super.c::parse_options() which was causing
1107 remounts to fail when the partition had an entry in /etc/fstab and
1108 the entry specified the nls= option.
1109 - Apply same macro magic used in fs/ntfs/inode.h to fs/ntfs/volume.h to
1110 expand all the helper functions NVolFoo(), NVolSetFoo(), and
1111 NVolClearFoo().
1112 - Move copyright statement from driver initialisation message to
1113 module description (fs/ntfs/super.c). This makes the initialisation
1114 message fit on one line and fits in better with rest of kernel.
1115 - Update fs/ntfs/attrib.c::map_run_list() to work on both real and
1116 attribute inodes, and both for files and directories.
1117 - Implement fake attribute inodes allowing all attribute i/o to go via
1118 the page cache and to use all the normal vfs/mm functionality:
1119 - Add ntfs_attr_iget() and its helper ntfs_read_locked_attr_inode()
1120 to fs/ntfs/inode.c.
1121 - Add needed cleanup code to ntfs_clear_big_inode().
1122 - Merge address space operations for files and directories (aops.c),
1123 now just have ntfs_aops:
1124 - Rename:
1125 end_buffer_read_attr_async() -> ntfs_end_buffer_read_async(),
1126 ntfs_attr_read_block() -> ntfs_read_block(),
1127 ntfs_file_read_page() -> ntfs_readpage().
1128 - Rewrite fs/ntfs/aops.c::ntfs_readpage() to work on both real and
1129 attribute inodes, and both for files and directories.
1130 - Remove obsolete fs/ntfs/aops.c::ntfs_mst_readpage().
1131
11322.0.14 - Run list merging code cleanup, minor locking changes, typo fixes.
1133
1134 - Change fs/ntfs/super.c::ntfs_statfs() to not rely on BKL by moving
1135 the locking out of super.c::get_nr_free_mft_records() and taking and
1136 dropping the mftbmp_lock rw_semaphore in ntfs_statfs() itself.
1137 - Bring attribute runlist merging code (fs/ntfs/attrib.c) in sync with
1138 current userspace ntfs library code. This means that if a merge
1139 fails the original runlists are always left unmodified instead of
1140 being silently corrupted.
1141 - Misc typo fixes.
1142
11432.0.13 - Use iget5_locked() in preparation for fake inodes and small cleanups.
1144
1145 - Remove nr_mft_bits and the now superfluous union with nr_mft_records
1146 from ntfs_volume structure.
1147 - Remove nr_lcn_bits and the now superfluous union with nr_clusters
1148 from ntfs_volume structure.
1149 - Use iget5_locked() and friends instead of conventional iget(). Wrap
1150 the call in fs/ntfs/inode.c::ntfs_iget() and update callers of iget()
1151 to use ntfs_iget(). Leave only one iget() call at mount time so we
1152 don't need an ntfs_iget_mount(). (A wrapper sketch follows below.)
1153 - Change fs/ntfs/inode.c::ntfs_new_extent_inode() to take mft_no as an
1154 additional argument.
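
        A sketch of what such an iget5_locked() wrapper typically looks
        like (hedged: the callbacks shown and the NTFS_I() accessor are
        assumptions here; see fs/ntfs/inode.c for the real ntfs_iget()):

                static int ntfs_test_inode(struct inode *vi, void *data)
                {
                        return NTFS_I(vi)->mft_no == *(unsigned long *)data;
                }

                static int ntfs_init_locked_inode(struct inode *vi, void *data)
                {
                        vi->i_ino = *(unsigned long *)data;
                        NTFS_I(vi)->mft_no = vi->i_ino;
                        return 0;
                }

                struct inode *ntfs_iget(struct super_block *sb,
                                unsigned long mft_no)
                {
                        struct inode *vi = iget5_locked(sb, mft_no,
                                        ntfs_test_inode,
                                        ntfs_init_locked_inode, &mft_no);

                        if (vi && (vi->i_state & I_NEW)) {
                                /* ... read the inode in from disk ..., then */
                                unlock_new_inode(vi);
                        }
                        return vi;
                }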
1155
11562.0.12 - Initial cleanup of address space operations following 2.0.11 changes.
1157
1158 - Merge fs/ntfs/aops.c::end_buffer_read_mst_async() and
1159 fs/ntfs/aops.c::end_buffer_read_file_async() into one function
1160 fs/ntfs/aops.c::end_buffer_read_attr_async() using NInoMstProtected()
1161 to determine whether to apply mst fixups or not.
1162 - Above change allows merging fs/ntfs/aops.c::ntfs_file_read_block()
1163 and fs/ntfs/aops.c::ntfs_mst_readpage() into one function
1164 fs/ntfs/aops.c::ntfs_attr_read_block(). Also, create a tiny wrapper
1165 fs/ntfs/aops.c::ntfs_mst_readpage() to transform the parameters from
1166 the VFS readpage function prototype to the ntfs_attr_read_block()
1167 function prototype.
1168
11692.0.11 - Initial preparations for fake inode based attribute i/o.
1170
1171 - Move definition of ntfs_inode_state_bits to fs/ntfs/inode.h and
1172 do some macro magic (adapted from include/linux/buffer_head.h) to
1173 expand all the helper functions NInoFoo(), NInoSetFoo(), and
1174 NInoClearFoo(). (The macro pattern is sketched below, after this list.)
1175 - Add new flag to ntfs_inode_state_bits: NI_Sparse.
1176 - Add new fields to ntfs_inode structure to allow use of fake inodes
1177 for attribute i/o: type, name, name_len. Also add new state bits:
1178 NI_Attr, which, if set, indicates the inode is a fake inode, and
1179 NI_MstProtected, which, if set, indicates the attribute uses multi
1180 sector transfer protection, i.e. fixups need to be applied after
1181 reads and before/after writes.
1182 - Rename fs/ntfs/inode.c::ntfs_{new,clear,destroy}_inode() to
1183 ntfs_{new,clear,destroy}_extent_inode() and update callers.
1184 - Use ntfs_clear_extent_inode() in fs/ntfs/inode.c::__ntfs_clear_inode()
1185 instead of ntfs_destroy_extent_inode().
1186 - Cleanup memory deallocations in {__,}ntfs_clear_{,big_}inode().
1187 - Make all operations on ntfs inode state bits use the NIno* functions.
1188 - Set up the new ntfs inode fields and state bits in
1189 fs/ntfs/inode.c::ntfs_read_inode() and add appropriate cleanup of
1190 allocated memory to __ntfs_clear_inode().
1191 - Cleanup ntfs_inode structure a bit for better ordering of elements
1192 w.r.t. their size to allow better packing of the structure in memory.
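
        The macro magic, schematically (in the spirit of
        include/linux/buffer_head.h; a sketch, not the exact
        fs/ntfs/inode.h text, and the state field name is an assumption):

                #define NINO_FNS(flag)                                  \
                static inline int NIno##flag(ntfs_inode *ni)            \
                {                                                       \
                        return test_bit(NI_##flag, &(ni)->state);       \
                }                                                       \
                static inline void NInoSet##flag(ntfs_inode *ni)        \
                {                                                       \
                        set_bit(NI_##flag, &(ni)->state);               \
                }                                                       \
                static inline void NInoClear##flag(ntfs_inode *ni)      \
                {                                                       \
                        clear_bit(NI_##flag, &(ni)->state);             \
                }

                NINO_FNS(Attr)          /* NInoAttr(), NInoSetAttr(), ... */
                NINO_FNS(MstProtected)
                NINO_FNS(Sparse)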
1193
11942.0.10 - There can only be 2^32 - 1 inodes on an NTFS volume.
1195
1196 - Add check at mount time to verify that the number of inodes on the
1197 volume does not exceed 2^32 - 1, which is the maximum allowed for
1198 NTFS according to Microsoft.
1199 - Change mft_no member of ntfs_inode structure to be unsigned long.
1200 Update all users. This makes ntfs_inode->mft_no just a copy of struct
1201 inode->i_ino. But we can't just always use struct inode->i_ino and
1202 remove mft_no because extent inodes do not have an attached struct
1203 inode.
1204
12052.0.9 - Decompression engine now uses a single buffer and other cleanups.
1206
1207 - Change decompression engine to use a single buffer protected by a
1208 spin lock instead of per-CPU buffers. (Rusty Russell)
1209 - Do not update cb_pos when handling a partial final page during
1210 decompression of a sparse compression block, as the value is later
1211 reset without being read/used. (Rusty Russell)
1212 - Switch to using the new KM_BIO_SRC_IRQ for atomic kmap()s. (Andrew
1213 Morton)
1214 - Change buffer size in ntfs_readdir()/ntfs_filldir() to use
1215 NLS_MAX_CHARSET_SIZE which makes the buffers almost 1kiB each but
1216 it also makes everything safer so it is a good thing.
1217 - Miscellaneous minor cleanups to comments.
1218
12192.0.8 - Major updates for handling of case sensitivity and dcache aliasing.
1220
1221 Big thanks go to Al Viro and other inhabitants of #kernel for investing
1222 their time to discuss the case sensitivity and dcache aliasing issues.
1223
1224 - Remove unused source file fs/ntfs/attraops.c.
1225 - Remove show_inodes mount option(s), thus dropping support for
1226 displaying of short file names.
1227 - Remove deprecated mount option posix.
1228 - Restore show_sys_files mount option.
1229 - Add new mount option case_sensitive, to determine if the driver
1230 treats file names as case sensitive or not. If case sensitive, create
1231 file names in the POSIX namespace. Otherwise create file names in the
1232 LONG/WIN32 namespace. Note, files remain accessible via their short
1233 file name, if it exists.
1234 - Remove really dumb logic bug in boot sector recovery code.
1235 - Fix dcache aliasing issues wrt short/long file names via changes
1236 to fs/ntfs/dir.c::ntfs_lookup_inode_by_name() and
1237 fs/ntfs/namei.c::ntfs_lookup():
1238 - Add additional argument to ntfs_lookup_inode_by_name() in which we
1239 return information about the matching file name if the case is not
1240 matching or the match is a short file name. See comments above the
1241 function definition for details.
1242 - Change ntfs_lookup() to only create dcache entries for the correctly
1243 cased file name and only for the WIN32 namespace counterpart of DOS
1244 namespace file names. This ensures we have only one dentry per
1245 directory and also removes all dcache aliasing issues between short
1246 and long file names once we add write support. See comments above
1247 function for details.
1248 - Fix potential 1 byte overflow in fs/ntfs/unistr.c::ntfs_ucstonls().
1249
12502.0.7 - Minor cleanups and updates for changes in core kernel code.
1251
1252 - Remove much of the NULL struct element initializers.
1253 - Various updates to make compatible with recent kernels.
1254 - Remove defines of MAX_BUF_PER_PAGE and include linux/buffer_head.h
1255 in fs/ntfs/ntfs.h instead.
1256 - Remove no longer needed KERNEL_VERSION checks. We are now in the
1257 kernel proper so they are no longer needed.
1258
12592.0.6 - Major bugfix to make compatible with other kernel changes.
1260
1261 - Initialize the mftbmp address space properly now that there are more
1262 fields in the struct address_space. This was leading to hangs and
1263 oopses on umount since 2.5.12 because of changes to other parts of
1264 the kernel. We probably want a kernel generic init_address_space()
1265 function...
1266 - Drop BKL from ntfs_readdir() after consultation with Al Viro. The
1267 only caller of ->readdir() is vfs_readdir() which holds i_mutex
1268 during the call, and i_mutex is sufficient protection against changes
1269 in the directory inode (including ->i_size).
1270 - Use generic_file_llseek() for directories (as opposed to
1271 default_llseek()) as this downs i_mutex instead of the BKL which is
1272 what we now need for exclusion against ->f_pos changes considering we
1273 no longer take the BKL in ntfs_readdir().
1274
12752.0.5 - Major bugfix. Buffer overflow in extent inode handling.
1276
1277 - No need to set old blocksize in super.c::ntfs_fill_super() as the
1278 VFS does so via invocation of deactivate_super() calling
1279 fs->fill_super() calling block_kill_super() which does it.
1280 - BKL moved from VFS into dir.c::ntfs_readdir(). (Linus Torvalds)
1281 -> Do we really need it? I don't think so as we have exclusion on
1282 the directory ntfs_inode rw_semaphore mrec_lock. We might have to
1283 move the ->f_pos accesses under the mrec_lock though. Check this...
1284 - Fix really, really, really stupid buffer overflow in extent inode
1285 handling in mft.c::map_extent_mft_record().
1286
12872.0.4 - Cleanups and updates for kernel 2.5.11.
1288
1289 - Add documentation on how to use the MD driver to be able to use NTFS
1290 stripe and volume sets in Linux and generally cleanup documentation
1291 a bit.
1292 Remove all uses of kdev_t in favour of struct block_device *:
1293 - Change compress.c::ntfs_file_read_compressed_block() to use
1294 sb_getblk() instead of getblk().
1295 - Change super.c::ntfs_fill_super() to use bdev_hardsect_size() instead
1296 of get_hardsect_size().
1297 - No need to get old blocksize in super.c::ntfs_fill_super() as
1298 fs/super.c::get_sb_bdev() already does this.
1299 - Set bh->b_bdev instead of bh->b_dev throughout aops.c.
1300
13012.0.3 - Small bug fixes, cleanups, and performance improvements.
1302
1303 - Remove some dead code from mft.c.
1304 - Optimize readpage and read_block functions throughout aops.c so that
1305 only initialized blocks are read. Non-initialized ones have their
1306 buffer head mapped, zeroed, and set up to date, without scheduling
1307 any i/o. Thanks to Al Viro for advice on how to avoid the device i/o.
1308 Thanks go to Andrew Morton for spotting the below:
1309 - Fix buglet in allocate_compression_buffers() error code path.
1310 - Call flush_dcache_page() after modifying page cache page contents in
1311 ntfs_file_readpage().
1312 - Check for existence of page buffers throughout aops.c before calling
1313 create_empty_buffers(). This happens when an I/O error occurs and the
1314 read is retried. (It also happens once writing is implemented so it
1315 needed doing anyway but I had left it for later...)
1316 - Don't BUG_ON() uptodate and/or mapped buffers throughout aops.c in
1317 readpage and read_block functions. Reasoning same as above (i.e. I/O
1318 error retries and future write code paths.)
1319
13202.0.2 - Minor updates and cleanups.
1321
1322 - Cleanup: rename mst.c::__post_read_mst_fixup to post_write_mst_fixup
1323 and cleanup the code a bit, removing the unused size parameter.
1324 - Change default fmask to 0177 and update documentation.
1325 - Change attrib.c::get_attr_search_ctx() to return the search context
1326 directly instead of taking the address of a pointer. A return value
1327 of NULL means the allocation failed. Updated all callers
1328 appropriately.
1329 - Update to 2.5.9 kernel (preserving backwards compatibility) by
1330 replacing all occurrences of page->buffers with page_buffers(page).
1331 - Fix minor bugs in runlist merging, also minor cleanup.
1332 - Updates to bootsector layout and mft mirror contents descriptions.
1333 - Small bug fix in error detection in unistr.c and some cleanups.
1334 - Grow name buffer allocations in unistr.c in aligned multiples of 64
1335 bytes.
1336
13372.0.1 - Minor updates.
1338
1339 - Make default umask correspond to documentation.
1340 - Improve documentation.
1341 - Set default mode to include execute bit. The {u,f,d}mask can be used
1342 to take it away if desired. This allows binaries to be executed from
1343 a mounted ntfs partition.
1344
13452.0.0 - New version number. Remove TNG from the name. Now in the kernel.
1346
1347 - Add kill_super, just keeping up with the vfs changes in the kernel.
1348 - Repeat some changes from tng-0.0.8 that somehow got lost on the way
1349 from the CVS import into BitKeeper.
1350 - Begin to implement proper handling of allocated_size vs
1351 initialized_size vs data_size (i.e. i_size). Done are
1352 mft.c::ntfs_mft_readpage(), aops.c::end_buffer_read_index_async(),
1353 and attrib.c::load_attribute_list().
1354 - Lock the runlist in attrib.c::load_attribute_list() while using it.
1355 - Fix memory leak in ntfs_file_read_compressed_block() and generally
1356 clean up compress.c a little, removing some uncommented/unused debug
1357 code.
1358 - Tidy up dir.c a little bit.
1359 - Don't bother getting the runlist in inode.c::ntfs_read_inode().
1360 - Merge mft.c::ntfs_mft_readpage() and aops.c::ntfs_index_readpage()
1361 creating aops.c::ntfs_mst_readpage(), improving the handling of
1362 holes and overflow in the process and implementing the correct
1363 equivalent of ntfs_file_get_block() in ntfs_mst_readpage() itself.
1364 I am aiming for correctness at the moment. Modularisation can come
1365 later.
1366 - Rename aops.c::end_buffer_read_index_async() to
1367 end_buffer_read_mst_async() and optimize the overflow checking and
1368 handling.
1369 - Use the host of the mftbmp address space mapping to hold the ntfs
1370 volume. This is needed so the async i/o completion handler can
1371 retrieve a pointer to the volume. Hopefully this will not cause
1372 problems elsewhere in the kernel... Otherwise will need to use a
1373 fake inode.
1374 - Complete implementation of proper handling of allocated_size vs
1375 initialized_size vs data_size (i.e. i_size) in whole driver.
1376 Basically aops.c is now completely rewritten.
1377 - Change NTFS driver name to just NTFS and set version number to 2.0.0
1378 to make a clear distinction from the old driver which is still on
1379 version 1.1.22.
1380
1381tng-0.0.8 - 08/03/2002 - Now using BitKeeper, http://linux-ntfs.bkbits.net/
1382
1383 - Replace bdevname(sb->s_dev) with sb->s_id.
1384 - Remove now superfluous new-line characters in all callers of
1385 ntfs_debug().
1386 - Apply kludge in ntfs_read_inode(), setting i_nlink to 1 for
1387 directories. Without this the "find" utility gets very upset which is
1388 fair enough as Linux/Unix do not support directory hard links.
1389 - Further runlist merging work. (Richard Russon)
1390 - Backwards compatibility for gcc-2.95. (Richard Russon)
1391 - Update to kernel 2.5.5-pre1 and rediff the now tiny patch.
1392 - Convert to new filesystem declaration using ->ntfs_get_sb() and
1393 replacing ntfs_read_super() with ntfs_fill_super().
1394 - Set s_maxbytes to MAX_LFS_FILESIZE to avoid page cache page index
1395 overflow on 32-bit architectures.
1396 - Cleanup upcase loading code to use ntfs_(un)map_page().
1397 - Disable/reenable preemption in critical sections of compression engine.
1398 - Replace device size determination in ntfs_fill_super() with
1399 sb->s_bdev->bd_inode->i_size (in bytes) and remove now superfluous
1400 function super.c::get_nr_blocks().
1401 - Implement a mount time option (show_inodes) allowing choice of which
1402 types of inode names readdir() returns and modify ntfs_filldir()
1403 accordingly. There are several parameters to show_inodes:
1404 system: system files
1405 win32: long file names (including POSIX file names) [DEFAULT]
1406 long: same as win32
1407 dos: short file names only (excluding POSIX file names)
1408 short: same as dos
1409 posix: same as both win32 and dos
1410 all: all file names
1411 Note that the options are additive, i.e. specifying:
1412 -o show_inodes=system,show_inodes=win32,show_inodes=dos
1413 is the same as specifying:
1414 -o show_inodes=all
1415 Note that the "posix" and "all" options will show all directory
1416 names, BUT the link count on each directory inode entry is set to 1,
1417 due to Linux not supporting directory hard links. This may well
1418 confuse some userspace applications, since the directory names will
1419 have the same inode numbers. Thus it is NOT advisable to use the
1420 "posix" or "all" options. We provide them only for completeness sake.
1421 - Add copies of allocated_size, initialized_size, and compressed_size to
1422 the ntfs inode structure and set them up in
1423 inode.c::ntfs_read_inode(). These reflect the unnamed data attribute
1424 for files and the index allocation attribute for directories.
1425 - Add copies of allocated_size and initialized_size to ntfs inode for
1426 $BITMAP attribute of large directories and set them up in
1427 inode.c::ntfs_read_inode().
1428 - Add copies of allocated_size and initialized_size to ntfs volume for
1429 $BITMAP attribute of $MFT and set them up in
1430 super.c::load_system_files().
1431 - Parse deprecated ntfs driver options (iocharset, show_sys_files,
1432 posix, and utf8) and tell user what the new options to use are. Note
1433 we still do support them but they will be removed with kernel 2.7.x.
1434 - Change all occurrences of integer long long printf formatting to hex
1435 as printk() will not support long long integer format if/when the
1436 div64 patch goes into the kernel.
1437 - Make slab caches have stable names and change the names to what they
1438 were intended to be. These changes are required/made possible by the
1439 new slab cache name handling which removes the length limitation by
1440 requiring the caller of kmem_cache_create() to supply a stable name
1441 which is then referenced but not copied.
1442 - Rename run_list structure to run_list_element and create a new
1443 run_list structure containing a pointer to a run_list_element
1444 structure and a read/write semaphore. Adapt all users of runlists
1445 to new scheme and take and release the lock as needed. This fixes a
1446 nasty race as the run_list changes even when inodes are locked for
1447 reading and even when the inode isn't locked at all, so we really
1448 needed the serialization. We use a semaphore rather than a spinlock
1449 as memory allocations can sleep and doing everything GFP_ATOMIC
1450 would be silly.
1451 - Cleanup read_inode() removing all code checking for lowest_vcn != 0.
1452 This can never happen due to the nature of lookup_attr() and how we
1453 support attribute lists. If it did happen it would imply the inode
1454 being corrupt.
1455 - Check for lowest_vcn != 0 in ntfs_read_inode() and mark the inode as
1456 bad if found.
1457 - Update to 2.5.6-pre2 changes in struct address_space.
1458 - Use parent_ino() when accessing d_parent inode number in dir.c.
1459 - Import Sourceforge CVS repository into BitKeeper repository:
1460 http://linux-ntfs.bkbits.net/ntfs-tng-2.5
1461 - Update fs/Makefile, fs/Config.help, fs/Config.in, and
1462 Documentation/filesystems/ntfs.txt for NTFS TNG.
1463 - Create kernel configuration option controlling whether debugging
1464 is enabled or not.
1465 - Add the required export of end_buffer_io_sync() from the patches
1466 directory to the kernel code.
1467 - Update inode.c::ntfs_show_options() with show_inodes mount option.
1468 - Update errors mount option.
1469
1470tng-0.0.7 - 13/02/2002 - The driver is now feature complete for read-only!
1471
1472 - Cleanup mft.c and its debug/error output in particular. Fix a minor
1473 bug in mapping of extent inodes. Update all the comments to fit all
1474 the recent code changes.
1475 - Modify vcn_to_lcn() to cope with entirely unmapped runlists.
1476 - Cleanups in compress.c, mostly comments and folding help.
1477 - Implement attrib.c::map_run_list() as a generic helper.
1478 - Make compress.c::ntfs_file_read_compressed_block() use map_run_list()
1479 thus making code shorter and enabling attribute list support.
1480 - Cleanup incorrect use of [su]64 with %L printf format specifier in
1481 all source files. Type casts to [unsigned] long long added to correct
1482 the mismatches (important for architectures which have long long not
1483 being 64 bits).
1484 - Merge async io completion handlers for directory indexes and $MFT
1485 data into one by setting the index_block_size{_bits} of the ntfs
1486 inode for $MFT to the mft_record_size{_bits} of the ntfs_volume.
1487 - Cleanup aops.c, update comments.
1488 - Make ntfs_file_get_block() use map_run_list() so all files now
1489 support attribute lists.
1490 - Make ntfs_dir_readpage() almost verbatim copy of
1491 block_read_full_page() by using ntfs_file_get_block() with only real
1492 difference being the use of our own async io completion handler
1493 rather than the default one, thus reducing the amount of code and
1494 automatically enabling attribute list support for directory indices.
1495 - Fix bug in load_attribute_list() - forgot to call brelse in error
1496 code path.
1497 - Change parameters to find_attr() and lookup_attr(). We no longer
1498 pass in the upcase table and its length. These can be gotten from
1499 ctx->ntfs_ino->vol->upcase{_len}. Update all callers.
1500 - Cleanups in attrib.c.
1501 - Implement merging of runlists, attrib.c::merge_run_lists() and its
1502 helpers. (Richard Russon)
1503 - Attribute lists part 2, attribute extents and multi part runlists:
1504 enable proper support for LCN_RL_NOT_MAPPED and automatic mapping of
1505 further runlist parts via attrib.c::map_run_list().
1506 - Tiny endianness bug fix in decompress_mapping_pairs().
1507
1508tng-0.0.6 - Encrypted directories, bug fixes, cleanups, debugging enhancements.
1509
1510 - Enable encrypted directories. (Their index root is marked encrypted
1511 to indicate that new files in that directory should be created
1512 encrypted.)
1513 - Fix bug in NInoBmpNonResident() macro. (Cut and paste error.)
1514 - Enable $Extend system directory. Most (if not all) extended system
1515 files do not have unnamed data attributes so ntfs_read_inode() had to
1516 special case them but that is ok, as the special casing recovery
1517 happens inside an error code path so there is zero slow down in the
1518 normal fast path. The special casing is done by introducing a new
1519 function inode.c::ntfs_is_extended_system_file() which checks if any
1520 of the hard links in the inode point to $Extend as being their parent
1521 directory and if they do we assume this is an extended system file.
1522 - Create a sysctl/proc interface to allow {dis,en}abling of debug output
1523 when compiled with -DDEBUG. Default is debug messages to be disabled.
1524 To enable them, one writes a non-zero value to /proc/sys/fs/ntfs-debug
1525 (if /proc is enabled) or uses sysctl(2) to effect the same (if sysctl
1526 interface is enabled). Inspired by old ntfs driver.
1527 - Add debug_msgs insmod/kernel boot parameter to set whether debug
1528 messages are {dis,en}abled. This is useful to enable debug messages
1529 during ntfs initialization and is the only way to activate debugging
1530 when the sysctl interface is not enabled.
1531 - Cleanup debug output in various places.
1532 - Remove all dollar signs ($) from the source (except comments) to
1533 enable compilation on architectures whose gcc compiler does not
1534 support dollar signs in the names of variables/constants. Attribute
1535 types now start with AT_ instead of $ and $I30 is now just I30.
1536 - Cleanup ntfs_lookup() and add consistency check of sequence numbers.
1537 - Load complete runlist for $MFT/$BITMAP during mount and cleanup
1538 access functions. This means we now cope with $MFT/$BITMAP being
1539 spread across several mft records.
1540 - Disable modification of mft_zone_multiplier on remount. We can always
1541 reenable this later on if we really want to, but we will need to make
1542 sure we readjust the mft_zone size / layout accordingly.
1543
1544tng-0.0.5 - Modernize for 2.5.x and further in line-ing with Al Viro's comments.
1545
1546 - Use sb_set_blocksize() instead of set_blocksize() and verify the
1547 return value.
1548 - Use sb_bread() instead of bread() throughout.
1549 - Add index_vcn_size{_bits} to ntfs_inode structure to store the size
1550 of a directory index block vcn. Apply resulting simplifications in
1551 dir.c everywhere.
1552 - Fix a small bug somewhere (but forgot what it was).
1553 - Change ntfs_{debug,error,warning} to enable gcc to do type checking
1554 on the printf-format parameter list and fix bugs reported by gcc
1555 as a result. (Richard Russon)
1556 - Move inode allocation strategy to Al's new stuff but maintain the
1557 divorce of ntfs_inode from struct inode. To achieve this we have two
1558 separate slab caches, one for big ntfs inodes containing a struct
1559 inode and one for pure ntfs inodes, and at the same time fix some
1560 faulty error code paths in ntfs_read_inode().
1561 - Show mount options in proc (inode.c::ntfs_show_options()).
1562
1563tng-0.0.4 - Big changes, getting in line with Al Viro's comments.
1564
1565 - Modified (un)map_mft_record functions to be common for read and write
1566 case. To specify which is which, added extra parameter at front of
1567 parameter list. Pass either READ or WRITE to this, each has the
1568 obvious meaning.
1569 - General cleanups to allow for easier folding in vi.
1570 - attrib.c::decompress_mapping_pairs() now accepts the old runlist
1571 argument, and invokes attrib.c::merge_run_lists() to merge the old
1572 and the new runlists.
1573 - Removed attrib.c::find_first_attr().
1574 - Implemented loading of attribute list and complete runlist for $MFT.
1575 This means we now cope with $MFT being spread across several mft
1576 records.
1577 - Adapt to 2.5.2-pre9 and the changed create_empty_buffers() syntax.
1578 - Adapt major/minor/kdev_t/[bk]devname stuff to new 2.5.x kernels.
1579 - Make ntfs_volume be allocated via kmalloc() instead of using a slab
1580 cache. There are too few ntfs_volume structures at any one time
1581 to justify a private slab cache.
1582 - Fix bogus kmap() use in async io completion. Now use kmap_atomic().
1583 Use KM_BIO_IRQ on advice from IRC/kernel...
1584 - Use ntfs_map_page() in map_mft_record() and create ->readpage method
1585 for reading $MFT (ntfs_mft_readpage). In the process create dedicated
1586 address space operations (ntfs_mft_aops) for $MFT inode mapping. Also
1587 removed the now superfluous exports from the kernel core patch.
1588 - Fix a bug where kfree() was used instead of ntfs_free().
1589 - Change map_mft_record() to take ntfs_inode as argument instead of
1590 vfs inode. Ditto for unmap_mft_record(). Adapt all callers.
1591 - Add pointer to ntfs_volume to ntfs_inode.
1592 - Add mft record number and sequence number to ntfs_inode. Stop using
1593 i_ino and i_generation for in-driver purposes.
1594 - Implement attrib.c::merge_run_lists(). (Richard Russon)
1595 - Remove use of proper inodes by extent inodes. Move i_ino and
1596 i_generation to ntfs_inode to do this. Apply simplifications that
1597 result and remove iget_no_wait(), etc.
1598 - Pass ntfs_inode everywhere in the driver (used to be struct inode).
1599 - Add reference counting in ntfs_inode for the ntfs inode itself and
1600 for the mapped mft record.
1601 - Extend mft record mapping so we can (un)map extent mft records (new
1602 functions (un)map_extent_mft_record), and so mappings are reference
1603 counted and don't have to happen twice if already mapped - just ref
1604 count increases.
1605 - Add -o iocharset as alias to -o nls for backwards compatibility.
1606 - The latest core patch is now tiny. In fact just a single additional
1607 export is necessary over the base kernel.
1608
1609tng-0.0.3 - Cleanups, enhancements, bug fixes.
1610
1611 - Work on attrib.c::decompress_mapping_pairs() to detect base extents
1612 and setup the runlist appropriately using knowledge provided by the
1613 sizes in the base attribute record.
1614 - Balance the get_/put_attr_search_ctx() calls so we don't leak memory
1615 any more.
1616 - Introduce ntfs_malloc_nofs() and ntfs_free() to allocate/free a single
1617 page or use vmalloc depending on the amount of memory requested (sketched below, after this list).
1618 - Cleanup error output. The __FUNCTION__ "(): " is now added
1619 automatically. Introduced a new header file debug.h to support this
1620 and also moved ntfs_debug() function into it.
1621 - Make reading of compressed files more intelligent and especially get
1622 rid of the vmalloc_nofs() from readpage(). This now uses per CPU
1623 buffers (allocated at first mount with cluster size <= 4kiB and
1624 deallocated on last umount with cluster size <= 4kiB), and
1625 asynchronous io for the compressed data using a list of buffer heads.
1626 Er, we use synchronous io as async io only works on whole pages
1627 covered by buffers and not on individual buffer heads...
1628 - Bug fix for reading compressed files with sparse compression blocks.
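
        The allocation strategy sketched (an approximation of the helper
        in fs/ntfs/malloc.h, using the three-argument __vmalloc() of that
        era; not the exact driver code):

                static inline void *ntfs_malloc_nofs(unsigned long size)
                {
                        if (likely(size <= PAGE_SIZE)) {
                                /* Up to one page: the slab allocator is
                                 * cheaper. */
                                return kmalloc(PAGE_SIZE, GFP_NOFS);
                        }
                        /* Larger requests: virtually contiguous memory,
                         * still GFP_NOFS to avoid recursing into the fs. */
                        return __vmalloc(size, GFP_NOFS, PAGE_KERNEL);
                }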
1629
1630tng-0.0.2 - Now handles larger/fragmented/compressed volumes/files/dirs.
1631
1632 - Fixed handling of directories when cluster size exceeds index block
1633 size.
1634 - Hide DOS only name space directory entries from readdir() but allow
1635 them in lookup(). This should fix the problem that Linux doesn't
1636 support directory hard links, while still allowing access to entries
1637 via their short file name. This also has the benefit of mimicking
1638 what Windows users are used to, so it is the ideal solution.
1639 - Implemented sync_page everywhere so no more hangs in D state when
1640 waiting for a page.
1641 - Stop using bforget() in favour of brelse().
1642 - Stop locking buffers unnecessarily.
1643 - Implemented compressed files (inode->mapping contains uncompressed
1644 data, raw compressed data is currently bread() into a vmalloc()ed
1645 memory buffer).
1646 - Enable compressed directories. (Their index root is marked compressed
1647 to indicate that new files in that directory should be created
1648 compressed.)
1649 - Use vsnprintf rather than vsprintf in the ntfs_error and ntfs_warning
1650 functions. (Thanks to Will Dyson for pointing this out.)
1651 - Moved the ntfs_inode and ntfs_volume (the former ntfs_inode_info and
1652 ntfs_sb_info) out of the common inode and super_block structures and
1653 started using the generic_ip and generic_sbp pointers instead. This
1654 makes ntfs entirely private with respect to the kernel tree.
1655 - Detect compiler version and abort with error message if gcc less than
1656 2.96 is used.
1657 - Fix bug in name comparison function in unistr.c.
1658 - Implement attribute lists part 1, the infrastructure: search contexts
1659 and operations, find_external_attr() and lookup_attr(), and make the
1660 code use the infrastructure.
1661 - Fix stupid buffer overflow bug that became apparent on larger run
1662 list containing attributes.
1663 - Fix bugs in readdir() that became apparent on larger directories.
1664
1665 The driver is now really useful and survives the test
1666 find . -type f -exec md5sum "{}" \;
1667 without any error messages on an over 1GiB sized partition with >16k
1668 files on it, including compressed files and directories and many files
1669 and directories with attribute lists.
1670
1671tng-0.0.1 - The first useful version.
1672
1673 - Added ntfs_lookup().
1674 - Added default upcase generation and handling.
1675 - Added compile options to be shown on module init.
1676 - Many bug fixes that were "hidden" before.
1677 - Update to latest kernel.
1678 - Added ntfs_readdir().
1679 - Added file operations for mmap(), read(), open() and llseek(). We just
1680 use the generic ones. The whole point of going through implementing
1681    readpage() methods and where possible get_block() callbacks is that
1682 this allows us to make use of the generic high level methods provided
1683 by the kernel.
1684
1685 The driver is now actually useful! Yey. (-: It undoubtedly has got bugs
1686    though and it doesn't implement accessing compressed files yet. Also,
1687 accessing files with attribute list attributes is not implemented yet
1688 either. But for small or simple filesystems it should work and allow
1689 you to list directories, use stat on directory entries and the file
1690    system, open, read, mmap and llseek around in files. A big milestone
1691 has been reached!
1692
1693tng-0.0.0 - Initial version tag.
1694
1695 Initial driver implementation. The driver can mount and umount simple
1696 NTFS filesystems (i.e. ones without attribute lists in the system
1697 files). If the mount fails there might be problems in the error handling
1698 code paths, so be warned. Otherwise it seems to be loading the system
1699 files nicely and the mft record read mapping/unmapping seems to be
1700    working nicely, too. Proof of concept for keeping inode metadata and
1701    non-resident unnamed file stream data in the page cache is thus
1702    complete.
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index cfce53cb65d7..c3c2c7ac9020 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -23,6 +23,7 @@
23 23
24#include <linux/errno.h> 24#include <linux/errno.h>
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/gfp.h>
26#include <linux/mm.h> 27#include <linux/mm.h>
27#include <linux/pagemap.h> 28#include <linux/pagemap.h>
28#include <linux/swap.h> 29#include <linux/swap.h>
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 50d3b0c258e3..f5094ee224c1 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/slab.h>
25#include <linux/swap.h> 26#include <linux/swap.h>
26#include <linux/writeback.h> 27#include <linux/writeback.h>
27 28
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index 08f7530e9341..6551c7cbad92 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -25,6 +25,7 @@
25#include <linux/buffer_head.h> 25#include <linux/buffer_head.h>
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/vmalloc.h> 27#include <linux/vmalloc.h>
28#include <linux/slab.h>
28 29
29#include "attrib.h" 30#include "attrib.h"
30#include "inode.h" 31#include "inode.h"
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 9173e82a45d1..fe44d3feee4a 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/slab.h>
24 25
25#include "dir.h" 26#include "dir.h"
26#include "aops.h" 27#include "aops.h"
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index b681c71d7069..8804f093ba75 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -20,6 +20,7 @@
20 */ 20 */
21 21
22#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
23#include <linux/gfp.h>
23#include <linux/pagemap.h> 24#include <linux/pagemap.h>
24#include <linux/pagevec.h> 25#include <linux/pagevec.h>
25#include <linux/sched.h> 26#include <linux/sched.h>
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
index 2194eff49743..096c135691ae 100644
--- a/fs/ntfs/index.c
+++ b/fs/ntfs/index.c
@@ -19,6 +19,8 @@
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 20 */
21 21
22#include <linux/slab.h>
23
22#include "aops.h" 24#include "aops.h"
23#include "collate.h" 25#include "collate.h"
24#include "debug.h" 26#include "debug.h"
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 1caa0ef0b2bb..b572b6727181 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/slab.h>
24#include <linux/swap.h> 25#include <linux/swap.h>
25 26
26#include "attrib.h" 27#include "attrib.h"
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index 2ca00153b6ec..358273e59ade 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -23,6 +23,7 @@
23#include <linux/dcache.h> 23#include <linux/dcache.h>
24#include <linux/exportfs.h> 24#include <linux/exportfs.h>
25#include <linux/security.h> 25#include <linux/security.h>
26#include <linux/slab.h>
26 27
27#include "attrib.h" 28#include "attrib.h"
28#include "debug.h" 29#include "debug.h"
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 1cf39dfaee7a..0de1db6cddbf 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -31,6 +31,7 @@
31#include <linux/vfs.h> 31#include <linux/vfs.h>
32#include <linux/moduleparam.h> 32#include <linux/moduleparam.h>
33#include <linux/smp_lock.h> 33#include <linux/smp_lock.h>
34#include <linux/bitmap.h>
34 35
35#include "sysctl.h" 36#include "sysctl.h"
36#include "logfile.h" 37#include "logfile.h"
@@ -2458,7 +2459,6 @@ static void ntfs_put_super(struct super_block *sb)
2458static s64 get_nr_free_clusters(ntfs_volume *vol) 2459static s64 get_nr_free_clusters(ntfs_volume *vol)
2459{ 2460{
2460 s64 nr_free = vol->nr_clusters; 2461 s64 nr_free = vol->nr_clusters;
2461 u32 *kaddr;
2462 struct address_space *mapping = vol->lcnbmp_ino->i_mapping; 2462 struct address_space *mapping = vol->lcnbmp_ino->i_mapping;
2463 struct page *page; 2463 struct page *page;
2464 pgoff_t index, max_index; 2464 pgoff_t index, max_index;
@@ -2477,7 +2477,8 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2477 ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.", 2477 ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.",
2478 max_index, PAGE_CACHE_SIZE / 4); 2478 max_index, PAGE_CACHE_SIZE / 4);
2479 for (index = 0; index < max_index; index++) { 2479 for (index = 0; index < max_index; index++) {
2480 unsigned int i; 2480 unsigned long *kaddr;
2481
2481 /* 2482 /*
2482 * Read the page from page cache, getting it from backing store 2483 * Read the page from page cache, getting it from backing store
2483 * if necessary, and increment the use count. 2484 * if necessary, and increment the use count.
@@ -2490,16 +2491,16 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2490 nr_free -= PAGE_CACHE_SIZE * 8; 2491 nr_free -= PAGE_CACHE_SIZE * 8;
2491 continue; 2492 continue;
2492 } 2493 }
2493 kaddr = (u32*)kmap_atomic(page, KM_USER0); 2494 kaddr = kmap_atomic(page, KM_USER0);
2494 /* 2495 /*
2495 * For each 4 bytes, subtract the number of set bits. If this 2496 * Subtract the number of set bits. If this
2496 * is the last page and it is partial we don't really care as 2497 * is the last page and it is partial we don't really care as
2497 * it just means we do a little extra work but it won't affect 2498 * it just means we do a little extra work but it won't affect
2498 * the result as all out of range bytes are set to zero by 2499 * the result as all out of range bytes are set to zero by
2499 * ntfs_readpage(). 2500 * ntfs_readpage().
2500 */ 2501 */
2501 for (i = 0; i < PAGE_CACHE_SIZE / 4; i++) 2502 nr_free -= bitmap_weight(kaddr,
2502 nr_free -= (s64)hweight32(kaddr[i]); 2503 PAGE_CACHE_SIZE * BITS_PER_BYTE);
2503 kunmap_atomic(kaddr, KM_USER0); 2504 kunmap_atomic(kaddr, KM_USER0);
2504 page_cache_release(page); 2505 page_cache_release(page);
2505 } 2506 }
@@ -2538,7 +2539,6 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2538static unsigned long __get_nr_free_mft_records(ntfs_volume *vol, 2539static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2539 s64 nr_free, const pgoff_t max_index) 2540 s64 nr_free, const pgoff_t max_index)
2540{ 2541{
2541 u32 *kaddr;
2542 struct address_space *mapping = vol->mftbmp_ino->i_mapping; 2542 struct address_space *mapping = vol->mftbmp_ino->i_mapping;
2543 struct page *page; 2543 struct page *page;
2544 pgoff_t index; 2544 pgoff_t index;
@@ -2548,7 +2548,8 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2548 ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = " 2548 ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = "
2549 "0x%lx.", max_index, PAGE_CACHE_SIZE / 4); 2549 "0x%lx.", max_index, PAGE_CACHE_SIZE / 4);
2550 for (index = 0; index < max_index; index++) { 2550 for (index = 0; index < max_index; index++) {
2551 unsigned int i; 2551 unsigned long *kaddr;
2552
2552 /* 2553 /*
2553 * Read the page from page cache, getting it from backing store 2554 * Read the page from page cache, getting it from backing store
2554 * if necessary, and increment the use count. 2555 * if necessary, and increment the use count.
@@ -2561,16 +2562,16 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2561 nr_free -= PAGE_CACHE_SIZE * 8; 2562 nr_free -= PAGE_CACHE_SIZE * 8;
2562 continue; 2563 continue;
2563 } 2564 }
2564 kaddr = (u32*)kmap_atomic(page, KM_USER0); 2565 kaddr = kmap_atomic(page, KM_USER0);
2565 /* 2566 /*
2566 * For each 4 bytes, subtract the number of set bits. If this 2567 * Subtract the number of set bits. If this
2567 * is the last page and it is partial we don't really care as 2568 * is the last page and it is partial we don't really care as
2568 * it just means we do a little extra work but it won't affect 2569 * it just means we do a little extra work but it won't affect
2569 * the result as all out of range bytes are set to zero by 2570 * the result as all out of range bytes are set to zero by
2570 * ntfs_readpage(). 2571 * ntfs_readpage().
2571 */ 2572 */
2572 for (i = 0; i < PAGE_CACHE_SIZE / 4; i++) 2573 nr_free -= bitmap_weight(kaddr,
2573 nr_free -= (s64)hweight32(kaddr[i]); 2574 PAGE_CACHE_SIZE * BITS_PER_BYTE);
2574 kunmap_atomic(kaddr, KM_USER0); 2575 kunmap_atomic(kaddr, KM_USER0);
2575 page_cache_release(page); 2576 page_cache_release(page);
2576 } 2577 }
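
The two hunks above replace a hand-rolled loop (hweight32() over each
u32 in the page) with a single bitmap_weight() call across the whole
mapped page. A standalone sketch of the pattern with illustrative
names; only bitmap_weight() and the constants come from the diff:

	#include <linux/bitmap.h>
	#include <linux/bitops.h>
	#include <linux/pagemap.h>
	#include <linux/types.h>

	/* Subtract every set bit in one kmapped bitmap page from nr_free. */
	static s64 subtract_page_weight(const unsigned long *kaddr, s64 nr_free)
	{
		/* one call counts all PAGE_CACHE_SIZE * 8 bits in the page */
		return nr_free - bitmap_weight(kaddr,
					       PAGE_CACHE_SIZE * BITS_PER_BYTE);
	}
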
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 791c0886c060..07d9fd854350 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -29,6 +29,7 @@ ocfs2-objs := \
29 mmap.o \ 29 mmap.o \
30 namei.o \ 30 namei.o \
31 refcounttree.o \ 31 refcounttree.o \
32 reservations.o \
32 resize.o \ 33 resize.o \
33 slot_map.o \ 34 slot_map.o \
34 suballoc.o \ 35 suballoc.o \
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 0501974bedd0..e13fc9e8fcdc 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -21,6 +21,7 @@
21 21
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/slab.h>
24#include <linux/string.h> 25#include <linux/string.h>
25 26
26#define MLOG_MASK_PREFIX ML_INODE 27#define MLOG_MASK_PREFIX ML_INODE
@@ -30,6 +31,8 @@
30#include "alloc.h" 31#include "alloc.h"
31#include "dlmglue.h" 32#include "dlmglue.h"
32#include "file.h" 33#include "file.h"
34#include "inode.h"
35#include "journal.h"
33#include "ocfs2_fs.h" 36#include "ocfs2_fs.h"
34 37
35#include "xattr.h" 38#include "xattr.h"
@@ -166,6 +169,60 @@ static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
166} 169}
167 170
168/* 171/*
172 * Helper function to set i_mode in memory and disk. Some call paths
173 * will not have di_bh or a journal handle to pass, in which case it
174 * will create its own.
175 */
176static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
177 handle_t *handle, umode_t new_mode)
178{
179 int ret, commit_handle = 0;
180 struct ocfs2_dinode *di;
181
182 if (di_bh == NULL) {
183 ret = ocfs2_read_inode_block(inode, &di_bh);
184 if (ret) {
185 mlog_errno(ret);
186 goto out;
187 }
188 } else
189 get_bh(di_bh);
190
191 if (handle == NULL) {
192 handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
193 OCFS2_INODE_UPDATE_CREDITS);
194 if (IS_ERR(handle)) {
195 ret = PTR_ERR(handle);
196 mlog_errno(ret);
197 goto out_brelse;
198 }
199
200 commit_handle = 1;
201 }
202
203 di = (struct ocfs2_dinode *)di_bh->b_data;
204 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
205 OCFS2_JOURNAL_ACCESS_WRITE);
206 if (ret) {
207 mlog_errno(ret);
208 goto out_commit;
209 }
210
211 inode->i_mode = new_mode;
212 di->i_mode = cpu_to_le16(inode->i_mode);
213
214 ocfs2_journal_dirty(handle, di_bh);
215
216out_commit:
217 if (commit_handle)
218 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
219out_brelse:
220 brelse(di_bh);
221out:
222 return ret;
223}
224
225/*
169 * Set the access or default ACL of an inode. 226 * Set the access or default ACL of an inode.
170 */ 227 */
171static int ocfs2_set_acl(handle_t *handle, 228static int ocfs2_set_acl(handle_t *handle,
@@ -193,9 +250,14 @@ static int ocfs2_set_acl(handle_t *handle,
193 if (ret < 0) 250 if (ret < 0)
194 return ret; 251 return ret;
195 else { 252 else {
196 inode->i_mode = mode;
197 if (ret == 0) 253 if (ret == 0)
198 acl = NULL; 254 acl = NULL;
255
256 ret = ocfs2_acl_set_mode(inode, di_bh,
257 handle, mode);
258 if (ret)
259 return ret;
260
199 } 261 }
200 } 262 }
201 break; 263 break;
@@ -283,6 +345,7 @@ int ocfs2_init_acl(handle_t *handle,
283 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 345 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
284 struct posix_acl *acl = NULL; 346 struct posix_acl *acl = NULL;
285 int ret = 0; 347 int ret = 0;
348 mode_t mode;
286 349
287 if (!S_ISLNK(inode->i_mode)) { 350 if (!S_ISLNK(inode->i_mode)) {
288 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { 351 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
@@ -291,12 +354,17 @@ int ocfs2_init_acl(handle_t *handle,
291 if (IS_ERR(acl)) 354 if (IS_ERR(acl))
292 return PTR_ERR(acl); 355 return PTR_ERR(acl);
293 } 356 }
294 if (!acl) 357 if (!acl) {
295 inode->i_mode &= ~current_umask(); 358 mode = inode->i_mode & ~current_umask();
359 ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
360 if (ret) {
361 mlog_errno(ret);
362 goto cleanup;
363 }
364 }
296 } 365 }
297 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { 366 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
298 struct posix_acl *clone; 367 struct posix_acl *clone;
299 mode_t mode;
300 368
301 if (S_ISDIR(inode->i_mode)) { 369 if (S_ISDIR(inode->i_mode)) {
302 ret = ocfs2_set_acl(handle, inode, di_bh, 370 ret = ocfs2_set_acl(handle, inode, di_bh,
@@ -313,7 +381,7 @@ int ocfs2_init_acl(handle_t *handle,
313 mode = inode->i_mode; 381 mode = inode->i_mode;
314 ret = posix_acl_create_masq(clone, &mode); 382 ret = posix_acl_create_masq(clone, &mode);
315 if (ret >= 0) { 383 if (ret >= 0) {
316 inode->i_mode = mode; 384 ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
317 if (ret > 0) { 385 if (ret > 0) {
318 ret = ocfs2_set_acl(handle, inode, 386 ret = ocfs2_set_acl(handle, inode,
319 di_bh, ACL_TYPE_ACCESS, 387 di_bh, ACL_TYPE_ACCESS,
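
The new ocfs2_acl_set_mode() helper above journals every i_mode update.
A hypothetical caller sketch, not from the patch; the helper is static
in acl.c, so this is purely illustrative of the two ways in:

	/*
	 * With NULL di_bh and handle, the helper reads the inode block and
	 * runs its own OCFS2_INODE_UPDATE_CREDITS transaction; with both
	 * supplied, it piggybacks on the caller's handle and buffer_head.
	 */
	static int example_set_mode(struct inode *inode, umode_t new_mode)
	{
		return ocfs2_acl_set_mode(inode, NULL, NULL, new_mode);
	}
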
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 9f8bd913c51e..215e12ce1d85 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1006,7 +1006,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1006 int count, status, i; 1006 int count, status, i;
1007 u16 suballoc_bit_start; 1007 u16 suballoc_bit_start;
1008 u32 num_got; 1008 u32 num_got;
1009 u64 first_blkno; 1009 u64 suballoc_loc, first_blkno;
1010 struct ocfs2_super *osb = 1010 struct ocfs2_super *osb =
1011 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); 1011 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
1012 struct ocfs2_extent_block *eb; 1012 struct ocfs2_extent_block *eb;
@@ -1015,10 +1015,10 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1015 1015
1016 count = 0; 1016 count = 0;
1017 while (count < wanted) { 1017 while (count < wanted) {
1018 status = ocfs2_claim_metadata(osb, 1018 status = ocfs2_claim_metadata(handle,
1019 handle,
1020 meta_ac, 1019 meta_ac,
1021 wanted - count, 1020 wanted - count,
1021 &suballoc_loc,
1022 &suballoc_bit_start, 1022 &suballoc_bit_start,
1023 &num_got, 1023 &num_got,
1024 &first_blkno); 1024 &first_blkno);
@@ -1052,6 +1052,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation); 1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
1053 eb->h_suballoc_slot = 1053 eb->h_suballoc_slot =
1054 cpu_to_le16(meta_ac->ac_alloc_slot); 1054 cpu_to_le16(meta_ac->ac_alloc_slot);
1055 eb->h_suballoc_loc = cpu_to_le64(suballoc_loc);
1055 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1056 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1056 eb->h_list.l_count = 1057 eb->h_list.l_count =
1057 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); 1058 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -1061,11 +1062,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1061 1062
1062 /* We'll also be dirtied by the caller, so 1063 /* We'll also be dirtied by the caller, so
1063 * this isn't absolutely necessary. */ 1064 * this isn't absolutely necessary. */
1064 status = ocfs2_journal_dirty(handle, bhs[i]); 1065 ocfs2_journal_dirty(handle, bhs[i]);
1065 if (status < 0) {
1066 mlog_errno(status);
1067 goto bail;
1068 }
1069 } 1066 }
1070 1067
1071 count += num_got; 1068 count += num_got;
@@ -1129,8 +1126,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle,
1129 goto out; 1126 goto out;
1130 } 1127 }
1131 1128
1132 status = ocfs2_extend_trans(handle, path_num_items(path) + 1129 status = ocfs2_extend_trans(handle, path_num_items(path));
1133 handle->h_buffer_credits);
1134 if (status < 0) { 1130 if (status < 0) {
1135 mlog_errno(status); 1131 mlog_errno(status);
1136 goto out; 1132 goto out;
@@ -1270,12 +1266,7 @@ static int ocfs2_add_branch(handle_t *handle,
1270 if (!eb_el->l_tree_depth) 1266 if (!eb_el->l_tree_depth)
1271 new_last_eb_blk = le64_to_cpu(eb->h_blkno); 1267 new_last_eb_blk = le64_to_cpu(eb->h_blkno);
1272 1268
1273 status = ocfs2_journal_dirty(handle, bh); 1269 ocfs2_journal_dirty(handle, bh);
1274 if (status < 0) {
1275 mlog_errno(status);
1276 goto bail;
1277 }
1278
1279 next_blkno = le64_to_cpu(eb->h_blkno); 1270 next_blkno = le64_to_cpu(eb->h_blkno);
1280 } 1271 }
1281 1272
@@ -1321,17 +1312,10 @@ static int ocfs2_add_branch(handle_t *handle,
1321 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; 1312 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
1322 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); 1313 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
1323 1314
1324 status = ocfs2_journal_dirty(handle, *last_eb_bh); 1315 ocfs2_journal_dirty(handle, *last_eb_bh);
1325 if (status < 0) 1316 ocfs2_journal_dirty(handle, et->et_root_bh);
1326 mlog_errno(status); 1317 if (eb_bh)
1327 status = ocfs2_journal_dirty(handle, et->et_root_bh); 1318 ocfs2_journal_dirty(handle, eb_bh);
1328 if (status < 0)
1329 mlog_errno(status);
1330 if (eb_bh) {
1331 status = ocfs2_journal_dirty(handle, eb_bh);
1332 if (status < 0)
1333 mlog_errno(status);
1334 }
1335 1319
1336 /* 1320 /*
1337 * Some callers want to track the rightmost leaf so pass it 1321 * Some callers want to track the rightmost leaf so pass it
@@ -1399,11 +1383,7 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
1399 for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++) 1383 for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1400 eb_el->l_recs[i] = root_el->l_recs[i]; 1384 eb_el->l_recs[i] = root_el->l_recs[i];
1401 1385
1402 status = ocfs2_journal_dirty(handle, new_eb_bh); 1386 ocfs2_journal_dirty(handle, new_eb_bh);
1403 if (status < 0) {
1404 mlog_errno(status);
1405 goto bail;
1406 }
1407 1387
1408 status = ocfs2_et_root_journal_access(handle, et, 1388 status = ocfs2_et_root_journal_access(handle, et,
1409 OCFS2_JOURNAL_ACCESS_WRITE); 1389 OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1428,11 +1408,7 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
1428 if (root_el->l_tree_depth == cpu_to_le16(1)) 1408 if (root_el->l_tree_depth == cpu_to_le16(1))
1429 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); 1409 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
1430 1410
1431 status = ocfs2_journal_dirty(handle, et->et_root_bh); 1411 ocfs2_journal_dirty(handle, et->et_root_bh);
1432 if (status < 0) {
1433 mlog_errno(status);
1434 goto bail;
1435 }
1436 1412
1437 *ret_new_eb_bh = new_eb_bh; 1413 *ret_new_eb_bh = new_eb_bh;
1438 new_eb_bh = NULL; 1414 new_eb_bh = NULL;
@@ -2064,7 +2040,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
2064 struct ocfs2_path *right_path, 2040 struct ocfs2_path *right_path,
2065 int subtree_index) 2041 int subtree_index)
2066{ 2042{
2067 int ret, i, idx; 2043 int i, idx;
2068 struct ocfs2_extent_list *el, *left_el, *right_el; 2044 struct ocfs2_extent_list *el, *left_el, *right_el;
2069 struct ocfs2_extent_rec *left_rec, *right_rec; 2045 struct ocfs2_extent_rec *left_rec, *right_rec;
2070 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; 2046 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
@@ -2102,13 +2078,8 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
2102 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, 2078 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
2103 right_el); 2079 right_el);
2104 2080
2105 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh); 2081 ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
2106 if (ret) 2082 ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
2107 mlog_errno(ret);
2108
2109 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
2110 if (ret)
2111 mlog_errno(ret);
2112 2083
2113 /* 2084 /*
2114 * Setup our list pointers now so that the current 2085 * Setup our list pointers now so that the current
@@ -2132,9 +2103,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
2132 2103
2133 root_bh = left_path->p_node[subtree_index].bh; 2104 root_bh = left_path->p_node[subtree_index].bh;
2134 2105
2135 ret = ocfs2_journal_dirty(handle, root_bh); 2106 ocfs2_journal_dirty(handle, root_bh);
2136 if (ret)
2137 mlog_errno(ret);
2138} 2107}
2139 2108
2140static int ocfs2_rotate_subtree_right(handle_t *handle, 2109static int ocfs2_rotate_subtree_right(handle_t *handle,
@@ -2207,11 +2176,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
2207 2176
2208 ocfs2_create_empty_extent(right_el); 2177 ocfs2_create_empty_extent(right_el);
2209 2178
2210 ret = ocfs2_journal_dirty(handle, right_leaf_bh); 2179 ocfs2_journal_dirty(handle, right_leaf_bh);
2211 if (ret) {
2212 mlog_errno(ret);
2213 goto out;
2214 }
2215 2180
2216 /* Do the copy now. */ 2181 /* Do the copy now. */
2217 i = le16_to_cpu(left_el->l_next_free_rec) - 1; 2182 i = le16_to_cpu(left_el->l_next_free_rec) - 1;
@@ -2230,11 +2195,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
2230 memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 2195 memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2231 le16_add_cpu(&left_el->l_next_free_rec, 1); 2196 le16_add_cpu(&left_el->l_next_free_rec, 1);
2232 2197
2233 ret = ocfs2_journal_dirty(handle, left_leaf_bh); 2198 ocfs2_journal_dirty(handle, left_leaf_bh);
2234 if (ret) {
2235 mlog_errno(ret);
2236 goto out;
2237 }
2238 2199
2239 ocfs2_complete_edge_insert(handle, left_path, right_path, 2200 ocfs2_complete_edge_insert(handle, left_path, right_path,
2240 subtree_index); 2201 subtree_index);
@@ -2249,8 +2210,8 @@ out:
2249 * 2210 *
2250 * Will return zero if the path passed in is already the leftmost path. 2211 * Will return zero if the path passed in is already the leftmost path.
2251 */ 2212 */
2252static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, 2213int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
2253 struct ocfs2_path *path, u32 *cpos) 2214 struct ocfs2_path *path, u32 *cpos)
2254{ 2215{
2255 int i, j, ret = 0; 2216 int i, j, ret = 0;
2256 u64 blkno; 2217 u64 blkno;
@@ -2327,20 +2288,14 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
2327 int op_credits, 2288 int op_credits,
2328 struct ocfs2_path *path) 2289 struct ocfs2_path *path)
2329{ 2290{
2330 int ret; 2291 int ret = 0;
2331 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; 2292 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2332 2293
2333 if (handle->h_buffer_credits < credits) { 2294 if (handle->h_buffer_credits < credits)
2334 ret = ocfs2_extend_trans(handle, 2295 ret = ocfs2_extend_trans(handle,
2335 credits - handle->h_buffer_credits); 2296 credits - handle->h_buffer_credits);
2336 if (ret)
2337 return ret;
2338 2297
2339 if (unlikely(handle->h_buffer_credits < credits)) 2298 return ret;
2340 return ocfs2_extend_trans(handle, credits);
2341 }
2342
2343 return 0;
2344} 2299}
2345 2300
2346/* 2301/*
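
The credit-handling hunks in this file stop adding handle->h_buffer_credits
to the amount passed to ocfs2_extend_trans(); after this merge the callers
treat it as taking the additional credits wanted, not a new total. A hedged
sketch of the resulting caller pattern (names illustrative):

	/* Make sure the handle has at least 'want' credits available. */
	static int example_ensure_credits(handle_t *handle, int want)
	{
		if (handle->h_buffer_credits >= want)
			return 0;	/* already enough reserved */
		/* pass only the shortfall, never want + h_buffer_credits */
		return ocfs2_extend_trans(handle,
					  want - handle->h_buffer_credits);
	}
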
@@ -2584,8 +2539,7 @@ static int ocfs2_update_edge_lengths(handle_t *handle,
2584 * records for all the bh in the path. 2539 * records for all the bh in the path.
2585 * So we have to allocate extra credits and access them. 2540 * So we have to allocate extra credits and access them.
2586 */ 2541 */
2587 ret = ocfs2_extend_trans(handle, 2542 ret = ocfs2_extend_trans(handle, subtree_index);
2588 handle->h_buffer_credits + subtree_index);
2589 if (ret) { 2543 if (ret) {
2590 mlog_errno(ret); 2544 mlog_errno(ret);
2591 goto out; 2545 goto out;
@@ -2823,12 +2777,8 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
2823 ocfs2_remove_empty_extent(right_leaf_el); 2777 ocfs2_remove_empty_extent(right_leaf_el);
2824 } 2778 }
2825 2779
2826 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 2780 ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2827 if (ret) 2781 ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2828 mlog_errno(ret);
2829 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2830 if (ret)
2831 mlog_errno(ret);
2832 2782
2833 if (del_right_subtree) { 2783 if (del_right_subtree) {
2834 ocfs2_unlink_subtree(handle, et, left_path, right_path, 2784 ocfs2_unlink_subtree(handle, et, left_path, right_path,
@@ -2851,9 +2801,7 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
2851 if (right_has_empty) 2801 if (right_has_empty)
2852 ocfs2_remove_empty_extent(left_leaf_el); 2802 ocfs2_remove_empty_extent(left_leaf_el);
2853 2803
2854 ret = ocfs2_journal_dirty(handle, et_root_bh); 2804 ocfs2_journal_dirty(handle, et_root_bh);
2855 if (ret)
2856 mlog_errno(ret);
2857 2805
2858 *deleted = 1; 2806 *deleted = 1;
2859 } else 2807 } else
@@ -2962,10 +2910,7 @@ static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
2962 } 2910 }
2963 2911
2964 ocfs2_remove_empty_extent(el); 2912 ocfs2_remove_empty_extent(el);
2965 2913 ocfs2_journal_dirty(handle, bh);
2966 ret = ocfs2_journal_dirty(handle, bh);
2967 if (ret)
2968 mlog_errno(ret);
2969 2914
2970out: 2915out:
2971 return ret; 2916 return ret;
@@ -3506,15 +3451,9 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
3506 3451
3507 ocfs2_cleanup_merge(el, index); 3452 ocfs2_cleanup_merge(el, index);
3508 3453
3509 ret = ocfs2_journal_dirty(handle, bh); 3454 ocfs2_journal_dirty(handle, bh);
3510 if (ret)
3511 mlog_errno(ret);
3512
3513 if (right_path) { 3455 if (right_path) {
3514 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); 3456 ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
3515 if (ret)
3516 mlog_errno(ret);
3517
3518 ocfs2_complete_edge_insert(handle, left_path, right_path, 3457 ocfs2_complete_edge_insert(handle, left_path, right_path,
3519 subtree_index); 3458 subtree_index);
3520 } 3459 }
@@ -3683,14 +3622,9 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
3683 3622
3684 ocfs2_cleanup_merge(el, index); 3623 ocfs2_cleanup_merge(el, index);
3685 3624
3686 ret = ocfs2_journal_dirty(handle, bh); 3625 ocfs2_journal_dirty(handle, bh);
3687 if (ret)
3688 mlog_errno(ret);
3689
3690 if (left_path) { 3626 if (left_path) {
3691 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 3627 ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3692 if (ret)
3693 mlog_errno(ret);
3694 3628
3695 /* 3629 /*
3696 * In the situation that the right_rec is empty and the extent 3630 * In the situation that the right_rec is empty and the extent
@@ -4016,10 +3950,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
4016 le32_add_cpu(&rec->e_int_clusters, 3950 le32_add_cpu(&rec->e_int_clusters,
4017 -le32_to_cpu(rec->e_cpos)); 3951 -le32_to_cpu(rec->e_cpos));
4018 3952
4019 ret = ocfs2_journal_dirty(handle, bh); 3953 ocfs2_journal_dirty(handle, bh);
4020 if (ret)
4021 mlog_errno(ret);
4022
4023 } 3954 }
4024} 3955}
4025 3956
@@ -4203,17 +4134,13 @@ static int ocfs2_insert_path(handle_t *handle,
4203 struct buffer_head *leaf_bh = path_leaf_bh(right_path); 4134 struct buffer_head *leaf_bh = path_leaf_bh(right_path);
4204 4135
4205 if (left_path) { 4136 if (left_path) {
4206 int credits = handle->h_buffer_credits;
4207
4208 /* 4137 /*
4209 * There's a chance that left_path got passed back to 4138 * There's a chance that left_path got passed back to
4210 * us without being accounted for in the 4139 * us without being accounted for in the
4211 * journal. Extend our transaction here to be sure we 4140 * journal. Extend our transaction here to be sure we
4212 * can change those blocks. 4141 * can change those blocks.
4213 */ 4142 */
4214 credits += left_path->p_tree_depth; 4143 ret = ocfs2_extend_trans(handle, left_path->p_tree_depth);
4215
4216 ret = ocfs2_extend_trans(handle, credits);
4217 if (ret < 0) { 4144 if (ret < 0) {
4218 mlog_errno(ret); 4145 mlog_errno(ret);
4219 goto out; 4146 goto out;
@@ -4251,17 +4178,13 @@ static int ocfs2_insert_path(handle_t *handle,
4251 * dirty this for us. 4178 * dirty this for us.
4252 */ 4179 */
4253 if (left_path) 4180 if (left_path)
4254 ret = ocfs2_journal_dirty(handle, 4181 ocfs2_journal_dirty(handle,
4255 path_leaf_bh(left_path)); 4182 path_leaf_bh(left_path));
4256 if (ret)
4257 mlog_errno(ret);
4258 } else 4183 } else
4259 ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path), 4184 ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
4260 insert); 4185 insert);
4261 4186
4262 ret = ocfs2_journal_dirty(handle, leaf_bh); 4187 ocfs2_journal_dirty(handle, leaf_bh);
4263 if (ret)
4264 mlog_errno(ret);
4265 4188
4266 if (left_path) { 4189 if (left_path) {
4267 /* 4190 /*
@@ -4384,9 +4307,7 @@ out_update_clusters:
4384 ocfs2_et_update_clusters(et, 4307 ocfs2_et_update_clusters(et,
4385 le16_to_cpu(insert_rec->e_leaf_clusters)); 4308 le16_to_cpu(insert_rec->e_leaf_clusters));
4386 4309
4387 ret = ocfs2_journal_dirty(handle, et->et_root_bh); 4310 ocfs2_journal_dirty(handle, et->et_root_bh);
4388 if (ret)
4389 mlog_errno(ret);
4390 4311
4391out: 4312out:
4392 ocfs2_free_path(left_path); 4313 ocfs2_free_path(left_path);
@@ -4866,7 +4787,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
4866 goto leave; 4787 goto leave;
4867 } 4788 }
4868 4789
4869 status = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 4790 status = __ocfs2_claim_clusters(handle, data_ac, 1,
4870 clusters_to_add, &bit_off, &num_bits); 4791 clusters_to_add, &bit_off, &num_bits);
4871 if (status < 0) { 4792 if (status < 0) {
4872 if (status != -ENOSPC) 4793 if (status != -ENOSPC)
@@ -4895,11 +4816,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
4895 goto leave; 4816 goto leave;
4896 } 4817 }
4897 4818
4898 status = ocfs2_journal_dirty(handle, et->et_root_bh); 4819 ocfs2_journal_dirty(handle, et->et_root_bh);
4899 if (status < 0) {
4900 mlog_errno(status);
4901 goto leave;
4902 }
4903 4820
4904 clusters_to_add -= num_bits; 4821 clusters_to_add -= num_bits;
4905 *logical_offset += num_bits; 4822 *logical_offset += num_bits;
@@ -5309,7 +5226,7 @@ static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
5309 int index, u32 new_range, 5226 int index, u32 new_range,
5310 struct ocfs2_alloc_context *meta_ac) 5227 struct ocfs2_alloc_context *meta_ac)
5311{ 5228{
5312 int ret, depth, credits = handle->h_buffer_credits; 5229 int ret, depth, credits;
5313 struct buffer_head *last_eb_bh = NULL; 5230 struct buffer_head *last_eb_bh = NULL;
5314 struct ocfs2_extent_block *eb; 5231 struct ocfs2_extent_block *eb;
5315 struct ocfs2_extent_list *rightmost_el, *el; 5232 struct ocfs2_extent_list *rightmost_el, *el;
@@ -5340,8 +5257,8 @@ static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
5340 } else 5257 } else
5341 rightmost_el = path_leaf_el(path); 5258 rightmost_el = path_leaf_el(path);
5342 5259
5343 credits += path->p_tree_depth + 5260 credits = path->p_tree_depth +
5344 ocfs2_extend_meta_needed(et->et_root_el); 5261 ocfs2_extend_meta_needed(et->et_root_el);
5345 ret = ocfs2_extend_trans(handle, credits); 5262 ret = ocfs2_extend_trans(handle, credits);
5346 if (ret) { 5263 if (ret) {
5347 mlog_errno(ret); 5264 mlog_errno(ret);
@@ -5671,19 +5588,97 @@ out:
5671 return ret; 5588 return ret;
5672} 5589}
5673 5590
5591/*
5592 * ocfs2_reserve_blocks_for_rec_trunc() looks basically the
5593 * same as ocfs2_lock_allocators(), except that it accepts a block
5594 * count so some extra blocks can be reserved, and it only handles
5595 * metadata allocations.
5596 *
5597 * Currently, only ocfs2_remove_btree_range() uses it for truncating
5598 * and punching holes.
5599 */
5600static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
5601 struct ocfs2_extent_tree *et,
5602 u32 extents_to_split,
5603 struct ocfs2_alloc_context **ac,
5604 int extra_blocks)
5605{
5606 int ret = 0, num_free_extents;
5607 unsigned int max_recs_needed = 2 * extents_to_split;
5608 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5609
5610 *ac = NULL;
5611
5612 num_free_extents = ocfs2_num_free_extents(osb, et);
5613 if (num_free_extents < 0) {
5614 ret = num_free_extents;
5615 mlog_errno(ret);
5616 goto out;
5617 }
5618
5619 if (!num_free_extents ||
5620 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
5621 extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
5622
5623 if (extra_blocks) {
5624 ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, ac);
5625 if (ret < 0) {
5626 if (ret != -ENOSPC)
5627 mlog_errno(ret);
5628 goto out;
5629 }
5630 }
5631
5632out:
5633 if (ret) {
5634 if (*ac) {
5635 ocfs2_free_alloc_context(*ac);
5636 *ac = NULL;
5637 }
5638 }
5639
5640 return ret;
5641}
5642
5674int ocfs2_remove_btree_range(struct inode *inode, 5643int ocfs2_remove_btree_range(struct inode *inode,
5675 struct ocfs2_extent_tree *et, 5644 struct ocfs2_extent_tree *et,
5676 u32 cpos, u32 phys_cpos, u32 len, 5645 u32 cpos, u32 phys_cpos, u32 len, int flags,
5677 struct ocfs2_cached_dealloc_ctxt *dealloc) 5646 struct ocfs2_cached_dealloc_ctxt *dealloc,
5647 u64 refcount_loc)
5678{ 5648{
5679 int ret; 5649 int ret, credits = 0, extra_blocks = 0;
5680 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); 5650 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
5681 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 5651 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5682 struct inode *tl_inode = osb->osb_tl_inode; 5652 struct inode *tl_inode = osb->osb_tl_inode;
5683 handle_t *handle; 5653 handle_t *handle;
5684 struct ocfs2_alloc_context *meta_ac = NULL; 5654 struct ocfs2_alloc_context *meta_ac = NULL;
5655 struct ocfs2_refcount_tree *ref_tree = NULL;
5656
5657 if ((flags & OCFS2_EXT_REFCOUNTED) && len) {
5658 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
5659 OCFS2_HAS_REFCOUNT_FL));
5660
5661 ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
5662 &ref_tree, NULL);
5663 if (ret) {
5664 mlog_errno(ret);
5665 goto out;
5666 }
5685 5667
5686 ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac); 5668 ret = ocfs2_prepare_refcount_change_for_del(inode,
5669 refcount_loc,
5670 phys_blkno,
5671 len,
5672 &credits,
5673 &extra_blocks);
5674 if (ret < 0) {
5675 mlog_errno(ret);
5676 goto out;
5677 }
5678 }
5679
5680 ret = ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac,
5681 extra_blocks);
5687 if (ret) { 5682 if (ret) {
5688 mlog_errno(ret); 5683 mlog_errno(ret);
5689 return ret; 5684 return ret;
@@ -5699,7 +5694,8 @@ int ocfs2_remove_btree_range(struct inode *inode,
5699 } 5694 }
5700 } 5695 }
5701 5696
5702 handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb)); 5697 handle = ocfs2_start_trans(osb,
5698 ocfs2_remove_extent_credits(osb->sb) + credits);
5703 if (IS_ERR(handle)) { 5699 if (IS_ERR(handle)) {
5704 ret = PTR_ERR(handle); 5700 ret = PTR_ERR(handle);
5705 mlog_errno(ret); 5701 mlog_errno(ret);
@@ -5724,15 +5720,22 @@ int ocfs2_remove_btree_range(struct inode *inode,
5724 5720
5725 ocfs2_et_update_clusters(et, -len); 5721 ocfs2_et_update_clusters(et, -len);
5726 5722
5727 ret = ocfs2_journal_dirty(handle, et->et_root_bh); 5723 ocfs2_journal_dirty(handle, et->et_root_bh);
5728 if (ret) {
5729 mlog_errno(ret);
5730 goto out_commit;
5731 }
5732 5724
5733 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); 5725 if (phys_blkno) {
5734 if (ret) 5726 if (flags & OCFS2_EXT_REFCOUNTED)
5735 mlog_errno(ret); 5727 ret = ocfs2_decrease_refcount(inode, handle,
5728 ocfs2_blocks_to_clusters(osb->sb,
5729 phys_blkno),
5730 len, meta_ac,
5731 dealloc, 1);
5732 else
5733 ret = ocfs2_truncate_log_append(osb, handle,
5734 phys_blkno, len);
5735 if (ret)
5736 mlog_errno(ret);
5737
5738 }
5736 5739
5737out_commit: 5740out_commit:
5738 ocfs2_commit_trans(osb, handle); 5741 ocfs2_commit_trans(osb, handle);
@@ -5742,6 +5745,9 @@ out:
5742 if (meta_ac) 5745 if (meta_ac)
5743 ocfs2_free_alloc_context(meta_ac); 5746 ocfs2_free_alloc_context(meta_ac);
5744 5747
5748 if (ref_tree)
5749 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
5750
5745 return ret; 5751 return ret;
5746} 5752}
5747 5753
@@ -5850,11 +5856,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5850 } 5856 }
5851 tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters); 5857 tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
5852 5858
5853 status = ocfs2_journal_dirty(handle, tl_bh); 5859 ocfs2_journal_dirty(handle, tl_bh);
5854 if (status < 0) {
5855 mlog_errno(status);
5856 goto bail;
5857 }
5858 5860
5859bail: 5861bail:
5860 mlog_exit(status); 5862 mlog_exit(status);
@@ -5893,11 +5895,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5893 5895
5894 tl->tl_used = cpu_to_le16(i); 5896 tl->tl_used = cpu_to_le16(i);
5895 5897
5896 status = ocfs2_journal_dirty(handle, tl_bh); 5898 ocfs2_journal_dirty(handle, tl_bh);
5897 if (status < 0) {
5898 mlog_errno(status);
5899 goto bail;
5900 }
5901 5899
5902 /* TODO: Perhaps we can calculate the bulk of the 5900 /* TODO: Perhaps we can calculate the bulk of the
5903 * credits up front rather than extending like 5901 * credits up front rather than extending like
@@ -6298,6 +6296,7 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
6298 */ 6296 */
6299struct ocfs2_cached_block_free { 6297struct ocfs2_cached_block_free {
6300 struct ocfs2_cached_block_free *free_next; 6298 struct ocfs2_cached_block_free *free_next;
6299 u64 free_bg;
6301 u64 free_blk; 6300 u64 free_blk;
6302 unsigned int free_bit; 6301 unsigned int free_bit;
6303}; 6302};
@@ -6344,8 +6343,11 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
6344 } 6343 }
6345 6344
6346 while (head) { 6345 while (head) {
6347 bg_blkno = ocfs2_which_suballoc_group(head->free_blk, 6346 if (head->free_bg)
6348 head->free_bit); 6347 bg_blkno = head->free_bg;
6348 else
6349 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
6350 head->free_bit);
6349 mlog(0, "Free bit: (bit %u, blkno %llu)\n", 6351 mlog(0, "Free bit: (bit %u, blkno %llu)\n",
6350 head->free_bit, (unsigned long long)head->free_blk); 6352 head->free_bit, (unsigned long long)head->free_blk);
6351 6353
@@ -6393,7 +6395,7 @@ int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6393 int ret = 0; 6395 int ret = 0;
6394 struct ocfs2_cached_block_free *item; 6396 struct ocfs2_cached_block_free *item;
6395 6397
6396 item = kmalloc(sizeof(*item), GFP_NOFS); 6398 item = kzalloc(sizeof(*item), GFP_NOFS);
6397 if (item == NULL) { 6399 if (item == NULL) {
6398 ret = -ENOMEM; 6400 ret = -ENOMEM;
6399 mlog_errno(ret); 6401 mlog_errno(ret);
@@ -6533,8 +6535,8 @@ ocfs2_find_per_slot_free_list(int type,
6533} 6535}
6534 6536
6535int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 6537int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6536 int type, int slot, u64 blkno, 6538 int type, int slot, u64 suballoc,
6537 unsigned int bit) 6539 u64 blkno, unsigned int bit)
6538{ 6540{
6539 int ret; 6541 int ret;
6540 struct ocfs2_per_slot_free_list *fl; 6542 struct ocfs2_per_slot_free_list *fl;
@@ -6547,7 +6549,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6547 goto out; 6549 goto out;
6548 } 6550 }
6549 6551
6550 item = kmalloc(sizeof(*item), GFP_NOFS); 6552 item = kzalloc(sizeof(*item), GFP_NOFS);
6551 if (item == NULL) { 6553 if (item == NULL) {
6552 ret = -ENOMEM; 6554 ret = -ENOMEM;
6553 mlog_errno(ret); 6555 mlog_errno(ret);
@@ -6557,6 +6559,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6557 mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n", 6559 mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
6558 type, slot, bit, (unsigned long long)blkno); 6560 type, slot, bit, (unsigned long long)blkno);
6559 6561
6562 item->free_bg = suballoc;
6560 item->free_blk = blkno; 6563 item->free_blk = blkno;
6561 item->free_bit = bit; 6564 item->free_bit = bit;
6562 item->free_next = fl->f_first; 6565 item->free_next = fl->f_first;
@@ -6573,433 +6576,11 @@ static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
6573{ 6576{
6574 return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE, 6577 return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
6575 le16_to_cpu(eb->h_suballoc_slot), 6578 le16_to_cpu(eb->h_suballoc_slot),
6579 le64_to_cpu(eb->h_suballoc_loc),
6576 le64_to_cpu(eb->h_blkno), 6580 le64_to_cpu(eb->h_blkno),
6577 le16_to_cpu(eb->h_suballoc_bit)); 6581 le16_to_cpu(eb->h_suballoc_bit));
6578} 6582}
6579 6583
6580/* This function will figure out whether the currently last extent
6581 * block will be deleted, and if it will, what the new last extent
6582 * block will be so we can update his h_next_leaf_blk field, as well
6583 * as the dinodes i_last_eb_blk */
6584static int ocfs2_find_new_last_ext_blk(struct inode *inode,
6585 unsigned int clusters_to_del,
6586 struct ocfs2_path *path,
6587 struct buffer_head **new_last_eb)
6588{
6589 int next_free, ret = 0;
6590 u32 cpos;
6591 struct ocfs2_extent_rec *rec;
6592 struct ocfs2_extent_block *eb;
6593 struct ocfs2_extent_list *el;
6594 struct buffer_head *bh = NULL;
6595
6596 *new_last_eb = NULL;
6597
6598 /* we have no tree, so of course, no last_eb. */
6599 if (!path->p_tree_depth)
6600 goto out;
6601
6602 /* trunc to zero special case - this makes tree_depth = 0
6603 * regardless of what it is. */
6604 if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
6605 goto out;
6606
6607 el = path_leaf_el(path);
6608 BUG_ON(!el->l_next_free_rec);
6609
6610 /*
6611 * Make sure that this extent list will actually be empty
6612 * after we clear away the data. We can shortcut out if
6613 * there's more than one non-empty extent in the
6614 * list. Otherwise, a check of the remaining extent is
6615 * necessary.
6616 */
6617 next_free = le16_to_cpu(el->l_next_free_rec);
6618 rec = NULL;
6619 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6620 if (next_free > 2)
6621 goto out;
6622
6623 /* We may have a valid extent in index 1, check it. */
6624 if (next_free == 2)
6625 rec = &el->l_recs[1];
6626
6627 /*
6628 * Fall through - no more nonempty extents, so we want
6629 * to delete this leaf.
6630 */
6631 } else {
6632 if (next_free > 1)
6633 goto out;
6634
6635 rec = &el->l_recs[0];
6636 }
6637
6638 if (rec) {
6639 /*
6640 * Check it we'll only be trimming off the end of this
6641 * cluster.
6642 */
6643 if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
6644 goto out;
6645 }
6646
6647 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
6648 if (ret) {
6649 mlog_errno(ret);
6650 goto out;
6651 }
6652
6653 ret = ocfs2_find_leaf(INODE_CACHE(inode), path_root_el(path), cpos, &bh);
6654 if (ret) {
6655 mlog_errno(ret);
6656 goto out;
6657 }
6658
6659 eb = (struct ocfs2_extent_block *) bh->b_data;
6660 el = &eb->h_list;
6661
6662 /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
6663 * Any corruption is a code bug. */
6664 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
6665
6666 *new_last_eb = bh;
6667 get_bh(*new_last_eb);
6668 mlog(0, "returning block %llu, (cpos: %u)\n",
6669 (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
6670out:
6671 brelse(bh);
6672
6673 return ret;
6674}
6675
6676/*
6677 * Trim some clusters off the rightmost edge of a tree. Only called
6678 * during truncate.
6679 *
6680 * The caller needs to:
6681 * - start journaling of each path component.
6682 * - compute and fully set up any new last ext block
6683 */
6684static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
6685 handle_t *handle, struct ocfs2_truncate_context *tc,
6686 u32 clusters_to_del, u64 *delete_start, u8 *flags)
6687{
6688 int ret, i, index = path->p_tree_depth;
6689 u32 new_edge = 0;
6690 u64 deleted_eb = 0;
6691 struct buffer_head *bh;
6692 struct ocfs2_extent_list *el;
6693 struct ocfs2_extent_rec *rec;
6694
6695 *delete_start = 0;
6696 *flags = 0;
6697
6698 while (index >= 0) {
6699 bh = path->p_node[index].bh;
6700 el = path->p_node[index].el;
6701
6702 mlog(0, "traveling tree (index = %d, block = %llu)\n",
6703 index, (unsigned long long)bh->b_blocknr);
6704
6705 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
6706
6707 if (index !=
6708 (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
6709 ocfs2_error(inode->i_sb,
6710 "Inode %lu has invalid ext. block %llu",
6711 inode->i_ino,
6712 (unsigned long long)bh->b_blocknr);
6713 ret = -EROFS;
6714 goto out;
6715 }
6716
6717find_tail_record:
6718 i = le16_to_cpu(el->l_next_free_rec) - 1;
6719 rec = &el->l_recs[i];
6720
6721 mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
6722 "next = %u\n", i, le32_to_cpu(rec->e_cpos),
6723 ocfs2_rec_clusters(el, rec),
6724 (unsigned long long)le64_to_cpu(rec->e_blkno),
6725 le16_to_cpu(el->l_next_free_rec));
6726
6727 BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
6728
6729 if (le16_to_cpu(el->l_tree_depth) == 0) {
6730 /*
6731 * If the leaf block contains a single empty
6732 * extent and no records, we can just remove
6733 * the block.
6734 */
6735 if (i == 0 && ocfs2_is_empty_extent(rec)) {
6736 memset(rec, 0,
6737 sizeof(struct ocfs2_extent_rec));
6738 el->l_next_free_rec = cpu_to_le16(0);
6739
6740 goto delete;
6741 }
6742
6743 /*
6744 * Remove any empty extents by shifting things
6745 * left. That should make life much easier on
6746 * the code below. This condition is rare
6747 * enough that we shouldn't see a performance
6748 * hit.
6749 */
6750 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6751 le16_add_cpu(&el->l_next_free_rec, -1);
6752
6753 for(i = 0;
6754 i < le16_to_cpu(el->l_next_free_rec); i++)
6755 el->l_recs[i] = el->l_recs[i + 1];
6756
6757 memset(&el->l_recs[i], 0,
6758 sizeof(struct ocfs2_extent_rec));
6759
6760 /*
6761 * We've modified our extent list. The
6762 * simplest way to handle this change
6763 * is to being the search from the
6764 * start again.
6765 */
6766 goto find_tail_record;
6767 }
6768
6769 le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
6770
6771 /*
6772 * We'll use "new_edge" on our way back up the
6773 * tree to know what our rightmost cpos is.
6774 */
6775 new_edge = le16_to_cpu(rec->e_leaf_clusters);
6776 new_edge += le32_to_cpu(rec->e_cpos);
6777
6778 /*
6779 * The caller will use this to delete data blocks.
6780 */
6781 *delete_start = le64_to_cpu(rec->e_blkno)
6782 + ocfs2_clusters_to_blocks(inode->i_sb,
6783 le16_to_cpu(rec->e_leaf_clusters));
6784 *flags = rec->e_flags;
6785
6786 /*
6787 * If it's now empty, remove this record.
6788 */
6789 if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
6790 memset(rec, 0,
6791 sizeof(struct ocfs2_extent_rec));
6792 le16_add_cpu(&el->l_next_free_rec, -1);
6793 }
6794 } else {
6795 if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
6796 memset(rec, 0,
6797 sizeof(struct ocfs2_extent_rec));
6798 le16_add_cpu(&el->l_next_free_rec, -1);
6799
6800 goto delete;
6801 }
6802
6803 /* Can this actually happen? */
6804 if (le16_to_cpu(el->l_next_free_rec) == 0)
6805 goto delete;
6806
6807 /*
6808 * We never actually deleted any clusters
6809 * because our leaf was empty. There's no
6810 * reason to adjust the rightmost edge then.
6811 */
6812 if (new_edge == 0)
6813 goto delete;
6814
6815 rec->e_int_clusters = cpu_to_le32(new_edge);
6816 le32_add_cpu(&rec->e_int_clusters,
6817 -le32_to_cpu(rec->e_cpos));
6818
6819 /*
6820 * A deleted child record should have been
6821 * caught above.
6822 */
6823 BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
6824 }
6825
6826delete:
6827 ret = ocfs2_journal_dirty(handle, bh);
6828 if (ret) {
6829 mlog_errno(ret);
6830 goto out;
6831 }
6832
6833 mlog(0, "extent list container %llu, after: record %d: "
6834 "(%u, %u, %llu), next = %u.\n",
6835 (unsigned long long)bh->b_blocknr, i,
6836 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
6837 (unsigned long long)le64_to_cpu(rec->e_blkno),
6838 le16_to_cpu(el->l_next_free_rec));
6839
6840 /*
6841 * We must be careful to only attempt delete of an
6842 * extent block (and not the root inode block).
6843 */
6844 if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
6845 struct ocfs2_extent_block *eb =
6846 (struct ocfs2_extent_block *)bh->b_data;
6847
6848 /*
6849 * Save this for use when processing the
6850 * parent block.
6851 */
6852 deleted_eb = le64_to_cpu(eb->h_blkno);
6853
6854 mlog(0, "deleting this extent block.\n");
6855
6856 ocfs2_remove_from_cache(INODE_CACHE(inode), bh);
6857
6858 BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
6859 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
6860 BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
6861
6862 ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
6863 /* An error here is not fatal. */
6864 if (ret < 0)
6865 mlog_errno(ret);
6866 } else {
6867 deleted_eb = 0;
6868 }
6869
6870 index--;
6871 }
6872
6873 ret = 0;
6874out:
6875 return ret;
6876}
6877
6878static int ocfs2_do_truncate(struct ocfs2_super *osb,
6879 unsigned int clusters_to_del,
6880 struct inode *inode,
6881 struct buffer_head *fe_bh,
6882 handle_t *handle,
6883 struct ocfs2_truncate_context *tc,
6884 struct ocfs2_path *path,
6885 struct ocfs2_alloc_context *meta_ac)
6886{
6887 int status;
6888 struct ocfs2_dinode *fe;
6889 struct ocfs2_extent_block *last_eb = NULL;
6890 struct ocfs2_extent_list *el;
6891 struct buffer_head *last_eb_bh = NULL;
6892 u64 delete_blk = 0;
6893 u8 rec_flags;
6894
6895 fe = (struct ocfs2_dinode *) fe_bh->b_data;
6896
6897 status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
6898 path, &last_eb_bh);
6899 if (status < 0) {
6900 mlog_errno(status);
6901 goto bail;
6902 }
6903
6904 /*
6905 * Each component will be touched, so we might as well journal
6906 * here to avoid having to handle errors later.
6907 */
6908 status = ocfs2_journal_access_path(INODE_CACHE(inode), handle, path);
6909 if (status < 0) {
6910 mlog_errno(status);
6911 goto bail;
6912 }
6913
6914 if (last_eb_bh) {
6915 status = ocfs2_journal_access_eb(handle, INODE_CACHE(inode), last_eb_bh,
6916 OCFS2_JOURNAL_ACCESS_WRITE);
6917 if (status < 0) {
6918 mlog_errno(status);
6919 goto bail;
6920 }
6921
6922 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6923 }
6924
6925 el = &(fe->id2.i_list);
6926
6927 /*
6928 * Lower levels depend on this never happening, but it's best
6929 * to check it up here before changing the tree.
6930 */
6931 if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
6932 ocfs2_error(inode->i_sb,
6933 "Inode %lu has an empty extent record, depth %u\n",
6934 inode->i_ino, le16_to_cpu(el->l_tree_depth));
6935 status = -EROFS;
6936 goto bail;
6937 }
6938
6939 dquot_free_space_nodirty(inode,
6940 ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
6941 spin_lock(&OCFS2_I(inode)->ip_lock);
6942 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
6943 clusters_to_del;
6944 spin_unlock(&OCFS2_I(inode)->ip_lock);
6945 le32_add_cpu(&fe->i_clusters, -clusters_to_del);
6946 inode->i_blocks = ocfs2_inode_sector_count(inode);
6947
6948 status = ocfs2_trim_tree(inode, path, handle, tc,
6949 clusters_to_del, &delete_blk, &rec_flags);
6950 if (status) {
6951 mlog_errno(status);
6952 goto bail;
6953 }
6954
6955 if (le32_to_cpu(fe->i_clusters) == 0) {
6956 /* trunc to zero is a special case. */
6957 el->l_tree_depth = 0;
6958 fe->i_last_eb_blk = 0;
6959 } else if (last_eb)
6960 fe->i_last_eb_blk = last_eb->h_blkno;
6961
6962 status = ocfs2_journal_dirty(handle, fe_bh);
6963 if (status < 0) {
6964 mlog_errno(status);
6965 goto bail;
6966 }
6967
6968 if (last_eb) {
6969 /* If there will be a new last extent block, then by
-			 * definition, there cannot be any leaves to the right of
-			 * him. */
-			last_eb->h_next_leaf_blk = 0;
-			status = ocfs2_journal_dirty(handle, last_eb_bh);
-			if (status < 0) {
-				mlog_errno(status);
-				goto bail;
-			}
-		}
-
-	if (delete_blk) {
-		if (rec_flags & OCFS2_EXT_REFCOUNTED)
-			status = ocfs2_decrease_refcount(inode, handle,
-					ocfs2_blocks_to_clusters(osb->sb,
-								 delete_blk),
-					clusters_to_del, meta_ac,
-					&tc->tc_dealloc, 1);
-		else
-			status = ocfs2_truncate_log_append(osb, handle,
-							   delete_blk,
-							   clusters_to_del);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
-		}
-	}
-	status = 0;
-bail:
-	brelse(last_eb_bh);
-	mlog_exit(status);
-	return status;
-}
-
 static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
 {
 	set_buffer_uptodate(bh);
@@ -7307,7 +6888,9 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 		goto out_commit;
 	did_quota = 1;

-	ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
+	data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
+
+	ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
 				   &num);
 	if (ret) {
 		mlog_errno(ret);
@@ -7406,26 +6989,29 @@ out:
  */
 int ocfs2_commit_truncate(struct ocfs2_super *osb,
 			  struct inode *inode,
-			  struct buffer_head *fe_bh,
-			  struct ocfs2_truncate_context *tc)
+			  struct buffer_head *di_bh)
 {
-	int status, i, credits, tl_sem = 0;
-	u32 clusters_to_del, new_highest_cpos, range;
+	int status = 0, i, flags = 0;
+	u32 new_highest_cpos, range, trunc_cpos, trunc_len, phys_cpos, coff;
 	u64 blkno = 0;
 	struct ocfs2_extent_list *el;
-	handle_t *handle = NULL;
-	struct inode *tl_inode = osb->osb_tl_inode;
+	struct ocfs2_extent_rec *rec;
 	struct ocfs2_path *path = NULL;
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
-	struct ocfs2_alloc_context *meta_ac = NULL;
-	struct ocfs2_refcount_tree *ref_tree = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_extent_list *root_el = &(di->id2.i_list);
+	u64 refcount_loc = le64_to_cpu(di->i_refcount_loc);
+	struct ocfs2_extent_tree et;
+	struct ocfs2_cached_dealloc_ctxt dealloc;

 	mlog_entry_void();

+	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
 	new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
 						    i_size_read(inode));

-	path = ocfs2_new_path(fe_bh, &di->id2.i_list,
+	path = ocfs2_new_path(di_bh, &di->id2.i_list,
 			      ocfs2_journal_access_di);
 	if (!path) {
 		status = -ENOMEM;
@@ -7444,8 +7030,6 @@ start:
 		goto bail;
 	}

-	credits = 0;
-
 	/*
 	 * Truncate always works against the rightmost tree branch.
 	 */
@@ -7480,101 +7064,62 @@ start:
 	}

 	i = le16_to_cpu(el->l_next_free_rec) - 1;
-	range = le32_to_cpu(el->l_recs[i].e_cpos) +
-		ocfs2_rec_clusters(el, &el->l_recs[i]);
-	if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
-		clusters_to_del = 0;
-	} else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
-		clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
-		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
+	rec = &el->l_recs[i];
+	flags = rec->e_flags;
+	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+
+	if (i == 0 && ocfs2_is_empty_extent(rec)) {
+		/*
+		 * Lower levels depend on this never happening, but it's best
+		 * to check it up here before changing the tree.
+		 */
+		if (root_el->l_tree_depth && rec->e_int_clusters == 0) {
+			ocfs2_error(inode->i_sb, "Inode %lu has an empty "
+				    "extent record, depth %u\n", inode->i_ino,
+				    le16_to_cpu(root_el->l_tree_depth));
+			status = -EROFS;
+			goto bail;
+		}
+		trunc_cpos = le32_to_cpu(rec->e_cpos);
+		trunc_len = 0;
+		blkno = 0;
+	} else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
+		/*
+		 * Truncate entire record.
+		 */
+		trunc_cpos = le32_to_cpu(rec->e_cpos);
+		trunc_len = ocfs2_rec_clusters(el, rec);
+		blkno = le64_to_cpu(rec->e_blkno);
 	} else if (range > new_highest_cpos) {
-		clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
-				   le32_to_cpu(el->l_recs[i].e_cpos)) -
-				  new_highest_cpos;
-		blkno = le64_to_cpu(el->l_recs[i].e_blkno) +
-			ocfs2_clusters_to_blocks(inode->i_sb,
-				ocfs2_rec_clusters(el, &el->l_recs[i]) -
-				clusters_to_del);
+		/*
+		 * Partial truncate. it also should be
+		 * the last truncate we're doing.
+		 */
+		trunc_cpos = new_highest_cpos;
+		trunc_len = range - new_highest_cpos;
+		coff = new_highest_cpos - le32_to_cpu(rec->e_cpos);
+		blkno = le64_to_cpu(rec->e_blkno) +
+				ocfs2_clusters_to_blocks(inode->i_sb, coff);
 	} else {
+		/*
+		 * Truncate completed, leave happily.
+		 */
 		status = 0;
 		goto bail;
 	}

-	mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
-	     clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
-
-	if (el->l_recs[i].e_flags & OCFS2_EXT_REFCOUNTED && clusters_to_del) {
-		BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
-			 OCFS2_HAS_REFCOUNT_FL));
-
-		status = ocfs2_lock_refcount_tree(osb,
-						le64_to_cpu(di->i_refcount_loc),
-						1, &ref_tree, NULL);
-		if (status) {
-			mlog_errno(status);
-			goto bail;
-		}
-
-		status = ocfs2_prepare_refcount_change_for_del(inode, fe_bh,
-							       blkno,
-							       clusters_to_del,
-							       &credits,
-							       &meta_ac);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
-		}
-	}
-
-	mutex_lock(&tl_inode->i_mutex);
-	tl_sem = 1;
-	/* ocfs2_truncate_log_needs_flush guarantees us at least one
-	 * record is free for use. If there isn't any, we flush to get
-	 * an empty truncate log. */
-	if (ocfs2_truncate_log_needs_flush(osb)) {
-		status = __ocfs2_flush_truncate_log(osb);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
-		}
-	}
+	phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);

-	credits += ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
-						 (struct ocfs2_dinode *)fe_bh->b_data,
-						 el);
-	handle = ocfs2_start_trans(osb, credits);
-	if (IS_ERR(handle)) {
-		status = PTR_ERR(handle);
-		handle = NULL;
-		mlog_errno(status);
-		goto bail;
-	}
-
-	status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
-				   tc, path, meta_ac);
+	status = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
+					  phys_cpos, trunc_len, flags, &dealloc,
+					  refcount_loc);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}

-	mutex_unlock(&tl_inode->i_mutex);
-	tl_sem = 0;
-
-	ocfs2_commit_trans(osb, handle);
-	handle = NULL;
-
 	ocfs2_reinit_path(path, 1);

-	if (meta_ac) {
-		ocfs2_free_alloc_context(meta_ac);
-		meta_ac = NULL;
-	}
-
-	if (ref_tree) {
-		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
-		ref_tree = NULL;
-	}
-
 	/*
 	 * The check above will catch the case where we've truncated
 	 * away all allocation.
@@ -7585,25 +7130,10 @@ bail:

 	ocfs2_schedule_truncate_log_flush(osb, 1);

-	if (tl_sem)
-		mutex_unlock(&tl_inode->i_mutex);
-
-	if (handle)
-		ocfs2_commit_trans(osb, handle);
-
-	if (meta_ac)
-		ocfs2_free_alloc_context(meta_ac);
-
-	if (ref_tree)
-		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
-
-	ocfs2_run_deallocs(osb, &tc->tc_dealloc);
+	ocfs2_run_deallocs(osb, &dealloc);

 	ocfs2_free_path(path);

-	/* This will drop the ext_alloc cluster lock for us */
-	ocfs2_free_truncate_context(tc);
-
 	mlog_exit(status);
 	return status;
 }
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 1db4359ccb90..55762b554b99 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -140,8 +140,9 @@ int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et,
 			struct ocfs2_cached_dealloc_ctxt *dealloc);
 int ocfs2_remove_btree_range(struct inode *inode,
 			     struct ocfs2_extent_tree *et,
-			     u32 cpos, u32 phys_cpos, u32 len,
-			     struct ocfs2_cached_dealloc_ctxt *dealloc);
+			     u32 cpos, u32 phys_cpos, u32 len, int flags,
+			     struct ocfs2_cached_dealloc_ctxt *dealloc,
+			     u64 refcount_loc);

 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 			   struct ocfs2_extent_tree *et);
@@ -209,7 +210,7 @@ static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
 int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
 				u64 blkno, unsigned int bit);
 int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
-			      int type, int slot, u64 blkno,
+			      int type, int slot, u64 suballoc, u64 blkno,
 			      unsigned int bit);
 static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
 {
@@ -233,8 +234,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
 			   struct ocfs2_truncate_context **tc);
 int ocfs2_commit_truncate(struct ocfs2_super *osb,
 			  struct inode *inode,
-			  struct buffer_head *fe_bh,
-			  struct ocfs2_truncate_context *tc);
+			  struct buffer_head *di_bh);
 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
 			  unsigned int start, unsigned int end, int trunc);

@@ -319,6 +319,8 @@ int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
 			      struct ocfs2_path *path);
 int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
 				   struct ocfs2_path *path, u32 *cpos);
+int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
+				  struct ocfs2_path *path, u32 *cpos);
 int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
 			    struct ocfs2_path *left,
 			    struct ocfs2_path *right);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 21441ddb5506..3623ca20cc18 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1735,6 +1735,9 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 		goto out;
 	}

+	if (data_ac)
+		data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
+
 	credits = ocfs2_calc_extend_credits(inode->i_sb,
 					    &di->id2.i_list,
 					    clusters_to_alloc);
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 21c808f752d8..f9d5d3ffc75a 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -25,7 +25,6 @@

 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>

 #include <cluster/masklog.h>
@@ -407,6 +406,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
 				struct buffer_head *bh)
 {
 	int ret = 0;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;

 	mlog_entry_void();

@@ -426,6 +426,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,

 	get_bh(bh); /* for end_buffer_write_sync() */
 	bh->b_end_io = end_buffer_write_sync;
+	ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check);
 	submit_bh(WRITE, bh);

 	wait_on_buffer(bh);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 5c9890006708..41d5f1f92d56 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -34,6 +34,7 @@
 #include <linux/crc32.h>
 #include <linux/time.h>
 #include <linux/debugfs.h>
+#include <linux/slab.h>

 #include "heartbeat.h"
 #include "tcp.h"
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index b39da877b12f..c7fba396392d 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -116,6 +116,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
 	define_mask(ERROR),
 	define_mask(NOTICE),
 	define_mask(KTHREAD),
+	define_mask(RESERVATIONS),
 };

 static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
@@ -136,7 +137,7 @@ static ssize_t mlog_store(struct kobject *obj, struct attribute *attr,
 	return mlog_mask_store(mlog_attr->mask, buf, count);
 }

-static struct sysfs_ops mlog_attr_ops = {
+static const struct sysfs_ops mlog_attr_ops = {
 	.show = mlog_show,
 	.store = mlog_store,
 };
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 3dfddbec32f2..fd96e2a2fa56 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -119,6 +119,7 @@
 #define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
 #define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */
 #define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */
+#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */

 #define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
 #define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index c81142e3ef84..ed0c9f367fed 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -19,6 +19,7 @@
  * Boston, MA 021110-1307, USA.
  */

+#include <linux/slab.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/configfs.h>
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 639024033fce..cf3e16696216 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -44,7 +44,6 @@
  * and if they're the last, they fire off the decision.
  */
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/workqueue.h>
 #include <linux/reboot.h>

diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index d8d0c65ac03c..aa75ca3f78da 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -72,9 +72,9 @@

 #include "tcp_internal.h"

-#define SC_NODEF_FMT "node %s (num %u) at %u.%u.%u.%u:%u"
+#define SC_NODEF_FMT "node %s (num %u) at %pI4:%u"
 #define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num, \
-			  NIPQUAD(sc->sc_node->nd_ipv4_address), \
+			  &sc->sc_node->nd_ipv4_address, \
 			  ntohs(sc->sc_node->nd_ipv4_port)

 /*
@@ -583,6 +583,9 @@ static void o2net_state_change(struct sock *sk)
 			o2net_sc_queue_work(sc, &sc->sc_connect_work);
 		break;
 	default:
+		printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT
+		       " shutdown, state %d\n",
+		       SC_NODEF_ARGS(sc), sk->sk_state);
 		o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
 		break;
 	}
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index efd77d071c80..f04ebcfffc4a 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1194,7 +1194,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
 			else
 				de->inode = 0;
 			dir->i_version++;
-			status = ocfs2_journal_dirty(handle, bh);
+			ocfs2_journal_dirty(handle, bh);
 			goto bail;
 		}
 		i += le16_to_cpu(de->rec_len);
@@ -1752,7 +1752,7 @@ int __ocfs2_add_entry(handle_t *handle,
 				ocfs2_recalc_free_list(dir, handle, lookup);

 			dir->i_version++;
-			status = ocfs2_journal_dirty(handle, insert_bh);
+			ocfs2_journal_dirty(handle, insert_bh);
 			retval = 0;
 			goto bail;
 		}
@@ -2297,12 +2297,7 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
 	}

 	ocfs2_fill_initial_dirents(inode, parent, data->id_data, size);
-
 	ocfs2_journal_dirty(handle, di_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}

 	i_size_write(inode, size);
 	inode->i_nlink = 2;
@@ -2366,11 +2361,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 		ocfs2_init_dir_trailer(inode, new_bh, size);
 	}

-	status = ocfs2_journal_dirty(handle, new_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
+	ocfs2_journal_dirty(handle, new_bh);

 	i_size_write(inode, inode->i_sb->s_blocksize);
 	inode->i_nlink = 2;
@@ -2404,15 +2395,15 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
 	int ret;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
 	u16 dr_suballoc_bit;
-	u64 dr_blkno;
+	u64 suballoc_loc, dr_blkno;
 	unsigned int num_bits;
 	struct buffer_head *dx_root_bh = NULL;
 	struct ocfs2_dx_root_block *dx_root;
 	struct ocfs2_dir_block_trailer *trailer =
 		ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);

-	ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit,
-				   &num_bits, &dr_blkno);
+	ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
+				   &dr_suballoc_bit, &num_bits, &dr_blkno);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -2440,6 +2431,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
 	memset(dx_root, 0, osb->sb->s_blocksize);
 	strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
 	dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+	dx_root->dr_suballoc_loc = cpu_to_le64(suballoc_loc);
 	dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
 	dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
 	dx_root->dr_blkno = cpu_to_le64(dr_blkno);
@@ -2458,10 +2450,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
 		dx_root->dr_list.l_count =
 			cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
 	}
-
-	ret = ocfs2_journal_dirty(handle, dx_root_bh);
-	if (ret)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(handle, dx_root_bh);

 	ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
 				      OCFS2_JOURNAL_ACCESS_CREATE);
@@ -2475,9 +2464,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
 	OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
 	di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);

-	ret = ocfs2_journal_dirty(handle, di_bh);
-	if (ret)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(handle, di_bh);

 	*ret_dx_root_bh = dx_root_bh;
 	dx_root_bh = NULL;
@@ -2558,7 +2545,7 @@ static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
 	 * chance of contiguousness as the directory grows in number
 	 * of entries.
 	 */
-	ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 1, &phys, &num);
+	ret = __ocfs2_claim_clusters(handle, data_ac, 1, 1, &phys, &num);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -2991,7 +2978,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	 * if we only get one now, that's enough to continue. The rest
 	 * will be claimed after the conversion to extents.
 	 */
-	ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len);
+	if (ocfs2_dir_resv_allowed(osb))
+		data_ac->ac_resv = &oi->ip_la_data_resv;
+	ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &len);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -3034,11 +3023,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 		ocfs2_init_dir_trailer(dir, dirdata_bh, i);
 	}

-	ret = ocfs2_journal_dirty(handle, dirdata_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
+	ocfs2_journal_dirty(handle, dirdata_bh);

 	if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
 		/*
@@ -3104,11 +3089,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	 */
 	dir->i_blocks = ocfs2_inode_sector_count(dir);

-	ret = ocfs2_journal_dirty(handle, di_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
+	ocfs2_journal_dirty(handle, di_bh);

 	if (ocfs2_supports_indexed_dirs(osb)) {
 		ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
@@ -3138,7 +3119,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	 * pass. Claim the 2nd cluster as a separate extent.
 	 */
 	if (alloc > len) {
-		ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
+		ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
 					   &len);
 		if (ret) {
 			mlog_errno(ret);
@@ -3369,6 +3350,9 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 			goto bail;
 		}

+		if (ocfs2_dir_resv_allowed(osb))
+			data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv;
+
 		credits = ocfs2_calc_extend_credits(sb, el, 1);
 	} else {
 		spin_unlock(&OCFS2_I(dir)->ip_lock);
@@ -3423,11 +3407,7 @@ do_extend:
 	} else {
 		de->rec_len = cpu_to_le16(sb->s_blocksize);
 	}
-	status = ocfs2_journal_dirty(handle, new_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
+	ocfs2_journal_dirty(handle, new_bh);

 	dir_i_size += dir->i_sb->s_blocksize;
 	i_size_write(dir, dir_i_size);
@@ -3906,11 +3886,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
 	     sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
 	     dx_leaf_sort_swap);

-	ret = ocfs2_journal_dirty(handle, dx_leaf_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
+	ocfs2_journal_dirty(handle, dx_leaf_bh);

 	ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
 					   &split_hash);
@@ -4490,7 +4466,10 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,

 	blk = le64_to_cpu(dx_root->dr_blkno);
 	bit = le16_to_cpu(dx_root->dr_suballoc_bit);
-	bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+	if (dx_root->dr_suballoc_loc)
+		bg_blkno = le64_to_cpu(dx_root->dr_suballoc_loc);
+	else
+		bg_blkno = ocfs2_which_suballoc_group(blk, bit);
 	ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
 				       bit, bg_blkno, 1);
 	if (ret)
@@ -4551,8 +4530,8 @@ int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)

 		p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);

-		ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen,
-					       &dealloc);
+		ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0,
+					       &dealloc, 0);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index dccc439fa087..f44999156839 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -28,7 +28,6 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
@@ -89,7 +88,7 @@ static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 	return 0;
 }

-static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 {
 	mlog_entry_void();

@@ -146,7 +145,7 @@ void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 }


-static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 {
 	mlog_entry_void();

@@ -185,9 +184,8 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 	BUG_ON(!lksb);

 	/* only updates if this node masters the lockres */
+	spin_lock(&res->spinlock);
 	if (res->owner == dlm->node_num) {
-
-		spin_lock(&res->spinlock);
 		/* check the lksb flags for the direction */
 		if (lksb->flags & DLM_LKSB_GET_LVB) {
 			mlog(0, "getting lvb from lockres for %s node\n",
@@ -202,8 +200,8 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 		 * here. In the future we might want to clear it at the time
 		 * the put is actually done.
 		 */
-		spin_unlock(&res->spinlock);
 	}
+	spin_unlock(&res->spinlock);

 	/* reset any lvb flags on the lksb */
 	lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
@@ -453,7 +451,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 	ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
 				     lock->ml.node, &status);
 	if (ret < 0)
-		mlog_errno(ret);
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", ret, DLM_PROXY_AST_MSG, dlm->key,
+		     lock->ml.node);
 	else {
 		if (status == DLM_RECOVERING) {
 			mlog(ML_ERROR, "sent AST to node %u, it thinks this "
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 0102be35980c..4b6ae2c13b47 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -37,7 +37,7 @@
 #define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes
 #define DLM_THREAD_MS 200 // flush at least every 200 ms

-#define DLM_HASH_SIZE_DEFAULT (1 << 14)
+#define DLM_HASH_SIZE_DEFAULT (1 << 17)
 #if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE
 # define DLM_HASH_PAGES 1
 #else
@@ -904,6 +904,8 @@ void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,

 void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
 void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
 void dlm_do_local_ast(struct dlm_ctxt *dlm,
 		      struct dlm_lock_resource *res,
 		      struct dlm_lock *lock);
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index f283bce776b4..9f30491e5e88 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -28,7 +28,6 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
@@ -391,7 +390,9 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
 		} else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED)
 			dlm_error(ret);
 	} else {
-		mlog_errno(tmpret);
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", tmpret, DLM_CONVERT_LOCK_MSG, dlm->key,
+		     res->owner);
 		if (dlm_is_host_down(tmpret)) {
 			/* instead of logging the same network error over
 			 * and over, sleep here and wait for the heartbeat
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 988c9055fd4e..6b5a492e1749 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -511,7 +511,7 @@ static void __dlm_print_nodes(struct dlm_ctxt *dlm)

 	assert_spin_locked(&dlm->spinlock);

-	printk(KERN_INFO "ocfs2_dlm: Nodes in domain (\"%s\"): ", dlm->name);
+	printk(KERN_NOTICE "o2dlm: Nodes in domain %s: ", dlm->name);

 	while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
 				     node + 1)) < O2NM_MAX_NODES) {
@@ -534,7 +534,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,

 	node = exit_msg->node_idx;

-	printk(KERN_INFO "ocfs2_dlm: Node %u leaves domain %s\n", node, dlm->name);
+	printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s\n", node, dlm->name);

 	spin_lock(&dlm->spinlock);
 	clear_bit(node, dlm->domain_map);
@@ -565,7 +565,9 @@ static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm,
 	status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key,
 				    &leave_msg, sizeof(leave_msg), node,
 				    NULL);
-
+	if (status < 0)
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", status, DLM_EXIT_DOMAIN_MSG, dlm->key, node);
 	mlog(0, "status return %d from o2net_send_message\n", status);

 	return status;
@@ -904,7 +906,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
 	set_bit(assert->node_idx, dlm->domain_map);
 	__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);

-	printk(KERN_INFO "ocfs2_dlm: Node %u joins domain %s\n",
+	printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n",
 	       assert->node_idx, dlm->name);
 	__dlm_print_nodes(dlm);

@@ -962,7 +964,9 @@ static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm,
 				    &cancel_msg, sizeof(cancel_msg), node,
 				    NULL);
 	if (status < 0) {
-		mlog_errno(status);
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", status, DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
+		     node);
 		goto bail;
 	}

@@ -1029,10 +1033,11 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
 	byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES);

 	status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg,
-				    sizeof(join_msg), node,
-				    &join_resp);
+				    sizeof(join_msg), node, &join_resp);
 	if (status < 0 && status != -ENOPROTOOPT) {
-		mlog_errno(status);
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", status, DLM_QUERY_JOIN_MSG, DLM_MOD_KEY,
+		     node);
 		goto bail;
 	}
 	dlm_query_join_wire_to_packet(join_resp, &packet);
@@ -1103,7 +1108,9 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
 				    &assert_msg, sizeof(assert_msg), node,
 				    NULL);
 	if (status < 0)
-		mlog_errno(status);
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
+		     node);

 	return status;
 }
@@ -1516,7 +1523,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 		goto leave;
 	}

-	dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL);
+	dlm->name = kstrdup(domain, GFP_KERNEL);
 	if (dlm->name == NULL) {
 		mlog_errno(-ENOMEM);
 		kfree(dlm);
@@ -1550,7 +1557,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	for (i = 0; i < DLM_HASH_BUCKETS; i++)
 		INIT_HLIST_HEAD(dlm_master_hash(dlm, i));

-	strcpy(dlm->name, domain);
 	dlm->key = key;
 	dlm->node_num = o2nm_this_node();

diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 733337772671..69cf369961c4 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -329,7 +329,9 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
 			BUG();
 		}
 	} else {
-		mlog_errno(tmpret);
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", tmpret, DLM_CREATE_LOCK_MSG, dlm->key,
+		     res->owner);
 		if (dlm_is_host_down(tmpret)) {
 			ret = DLM_RECOVERING;
 			mlog(0, "node %u died so returning DLM_RECOVERING "
@@ -429,7 +431,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
 	struct dlm_lock *lock;
 	int kernel_allocated = 0;

-	lock = (struct dlm_lock *) kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
+	lock = kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
 	if (!lock)
 		return NULL;

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index a659606dcb95..4a7506a4e314 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -617,13 +617,11 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
 {
 	struct dlm_lock_resource *res = NULL;

-	res = (struct dlm_lock_resource *)
-				kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
+	res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
 	if (!res)
 		goto error;

-	res->lockname.name = (char *)
-			kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
+	res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
 	if (!res->lockname.name)
 		goto error;

@@ -757,8 +755,7 @@ lookup:
 	spin_unlock(&dlm->spinlock);
 	mlog(0, "allocating a new resource\n");
 	/* nothing found and we need to allocate one. */
-	alloc_mle = (struct dlm_master_list_entry *)
-		kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
+	alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
 	if (!alloc_mle)
 		goto leave;
 	res = dlm_new_lockres(dlm, lockid, namelen);
@@ -1542,8 +1539,7 @@ way_up_top:
 		spin_unlock(&dlm->master_lock);
 		spin_unlock(&dlm->spinlock);

-		mle = (struct dlm_master_list_entry *)
-			kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
+		mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
 		if (!mle) {
 			response = DLM_MASTER_RESP_ERROR;
 			mlog_errno(-ENOMEM);
@@ -1666,7 +1662,9 @@ again:
 	tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
 				    &assert, sizeof(assert), to, &r);
 	if (tmpret < 0) {
-		mlog(0, "assert_master returned %d!\n", tmpret);
+		mlog(ML_ERROR, "Error %d when sending message %u (key "
+		     "0x%x) to node %u\n", tmpret,
+		     DLM_ASSERT_MASTER_MSG, dlm->key, to);
 		if (!dlm_is_host_down(tmpret)) {
 			mlog(ML_ERROR, "unhandled error=%d!\n", tmpret);
 			BUG();
@@ -1875,7 +1873,6 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
 ok:
 		spin_unlock(&res->spinlock);
 	}
-	spin_unlock(&dlm->spinlock);

 	// mlog(0, "woo! got an assert_master from node %u!\n",
 	// assert->node_idx);
@@ -1926,7 +1923,6 @@ ok:
 		/* master is known, detach if not already detached.
 		 * ensures that only one assert_master call will happen
 		 * on this mle. */
-		spin_lock(&dlm->spinlock);
 		spin_lock(&dlm->master_lock);

 		rr = atomic_read(&mle->mle_refs.refcount);
@@ -1959,7 +1955,6 @@ ok:
 			__dlm_put_mle(mle);
 		}
 		spin_unlock(&dlm->master_lock);
-		spin_unlock(&dlm->spinlock);
 	} else if (res) {
 		if (res->owner != assert->node_idx) {
 			mlog(0, "assert_master from %u, but current "
@@ -1967,6 +1962,7 @@ ok:
 			     res->owner, namelen, name);
 		}
 	}
+	spin_unlock(&dlm->spinlock);

 done:
 	ret = 0;
@@ -2207,7 +2203,9 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
 	ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
 				 &deref, sizeof(deref), res->owner, &r);
 	if (ret < 0)
-		mlog_errno(ret);
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", ret, DLM_DEREF_LOCKRES_MSG, dlm->key,
+		     res->owner);
 	else if (r < 0) {
 		/* BAD. other node says I did not have a ref. */
 		mlog(ML_ERROR,"while dropping ref on %s:%.*s "
@@ -2454,8 +2452,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
 		goto leave;
 	}

-	mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
-								GFP_NOFS);
+	mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
 	if (!mle) {
 		mlog_errno(ret);
 		goto leave;
@@ -2977,7 +2974,9 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
 					 &migrate, sizeof(migrate), nodenum,
 					 &status);
 		if (ret < 0) {
-			mlog(0, "migrate_request returned %d!\n", ret);
+			mlog(ML_ERROR, "Error %d when sending message %u (key "
+			     "0x%x) to node %u\n", ret, DLM_MIGRATE_REQUEST_MSG,
+			     dlm->key, nodenum);
 			if (!dlm_is_host_down(ret)) {
 				mlog(ML_ERROR, "unhandled error=%d!\n", ret);
 				BUG();
@@ -3035,8 +3034,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
 	hash = dlm_lockid_hash(name, namelen);

 	/* preallocate.. if this fails, abort */
-	mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
-								GFP_NOFS);
+	mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);

 	if (!mle) {
 		ret = -ENOMEM;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index b4f99de2caf3..f8b75ce4be70 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -803,7 +803,9 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,

 	/* negative status is handled by caller */
 	if (ret < 0)
-		mlog_errno(ret);
+		mlog(ML_ERROR, "Error %d when sending message %u (key "
+		     "0x%x) to node %u\n", ret, DLM_LOCK_REQUEST_MSG,
+		     dlm->key, request_from);

 	// return from here, then
 	// sleep until all received or error
@@ -955,10 +957,10 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
 	ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
 				 sizeof(done_msg), send_to, &tmpret);
 	if (ret < 0) {
+		mlog(ML_ERROR, "Error %d when sending message %u (key "
+		     "0x%x) to node %u\n", ret, DLM_RECO_DATA_DONE_MSG,
+		     dlm->key, send_to);
 		if (!dlm_is_host_down(ret)) {
-			mlog_errno(ret);
-			mlog(ML_ERROR, "%s: unknown error sending data-done "
-			     "to %u\n", dlm->name, send_to);
 			BUG();
 		}
 	} else
@@ -1126,7 +1128,9 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
 	if (ret < 0) {
 		/* XXX: negative status is not handled.
 		 * this will end up killing this node. */
-		mlog_errno(ret);
+		mlog(ML_ERROR, "Error %d when sending message %u (key "
+		     "0x%x) to node %u\n", ret, DLM_MIG_LOCKRES_MSG,
+		     dlm->key, send_to);
 	} else {
 		/* might get an -ENOMEM back here */
 		ret = status;
@@ -1642,7 +1646,9 @@ int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 				 &req, sizeof(req), nodenum, &status);
 	/* XXX: negative status not handled properly here. */
 	if (ret < 0)
-		mlog_errno(ret);
+		mlog(ML_ERROR, "Error %d when sending message %u (key "
+		     "0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG,
+		     dlm->key, nodenum);
 	else {
 		BUG_ON(status < 0);
 		BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN);
@@ -2640,7 +2646,7 @@ retry:
 		if (dlm_is_host_down(ret)) {
 			/* node is down. not involved in recovery
 			 * so just keep going */
-			mlog(0, "%s: node %u was down when sending "
+			mlog(ML_NOTICE, "%s: node %u was down when sending "
 			     "begin reco msg (%d)\n", dlm->name, nodenum, ret);
 			ret = 0;
 		}
@@ -2660,11 +2666,12 @@ retry:
 		}
 		if (ret < 0) {
 			struct dlm_lock_resource *res;
+
 			/* this is now a serious problem, possibly ENOMEM
 			 * in the network stack. must retry */
 			mlog_errno(ret);
 			mlog(ML_ERROR, "begin reco of dlm %s to node %u "
-			     " returned %d\n", dlm->name, nodenum, ret);
+			     "returned %d\n", dlm->name, nodenum, ret);
 			res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
 						 DLM_RECOVERY_LOCK_NAME_LEN);
 			if (res) {
@@ -2789,7 +2796,9 @@ stage2:
 	if (ret >= 0)
 		ret = status;
 	if (ret < 0) {
-		mlog_errno(ret);
+		mlog(ML_ERROR, "Error %d when sending message %u (key "
+		     "0x%x) to node %u\n", ret, DLM_FINALIZE_RECO_MSG,
+		     dlm->key, nodenum);
 		if (dlm_is_host_down(ret)) {
 			/* this has no effect on this recovery
 			 * session, so set the status to zero to
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 52ec020ea78b..d4f73ca68fe5 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -28,7 +28,6 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
@@ -310,6 +309,7 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
 	 * spinlock, and because we know that it is not migrating/
 	 * recovering/in-progress, it is fine to reserve asts and
 	 * basts right before queueing them all throughout */
+	assert_spin_locked(&dlm->ast_lock);
 	assert_spin_locked(&res->spinlock);
 	BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
 			      DLM_LOCK_RES_RECOVERING|
@@ -338,7 +338,7 @@ converting:
 			/* queue the BAST if not already */
 			if (lock->ml.highest_blocked == LKM_IVMODE) {
 				__dlm_lockres_reserve_ast(res);
-				dlm_queue_bast(dlm, lock);
+				__dlm_queue_bast(dlm, lock);
 			}
 			/* update the highest_blocked if needed */
 			if (lock->ml.highest_blocked < target->ml.convert_type)
@@ -356,7 +356,7 @@ converting:
 				can_grant = 0;
 			if (lock->ml.highest_blocked == LKM_IVMODE) {
 				__dlm_lockres_reserve_ast(res);
-				dlm_queue_bast(dlm, lock);
+				__dlm_queue_bast(dlm, lock);
 			}
 			if (lock->ml.highest_blocked < target->ml.convert_type)
 				lock->ml.highest_blocked =
@@ -384,7 +384,7 @@ converting:
 		spin_unlock(&target->spinlock);

 		__dlm_lockres_reserve_ast(res);
-		dlm_queue_ast(dlm, target);
+		__dlm_queue_ast(dlm, target);
 		/* go back and check for more */
 		goto converting;
 	}
@@ -403,7 +403,7 @@ blocked:
 				can_grant = 0;
 			if (lock->ml.highest_blocked == LKM_IVMODE) {
 				__dlm_lockres_reserve_ast(res);
-				dlm_queue_bast(dlm, lock);
+				__dlm_queue_bast(dlm, lock);
 			}
 			if (lock->ml.highest_blocked < target->ml.type)
 				lock->ml.highest_blocked = target->ml.type;
@@ -419,7 +419,7 @@ blocked:
 				can_grant = 0;
 			if (lock->ml.highest_blocked == LKM_IVMODE) {
 				__dlm_lockres_reserve_ast(res);
-				dlm_queue_bast(dlm, lock);
+				__dlm_queue_bast(dlm, lock);
 			}
 			if (lock->ml.highest_blocked < target->ml.type)
 				lock->ml.highest_blocked = target->ml.type;
@@ -445,7 +445,7 @@ blocked:
 		spin_unlock(&target->spinlock);

 		__dlm_lockres_reserve_ast(res);
-		dlm_queue_ast(dlm, target);
+		__dlm_queue_ast(dlm, target);
 		/* go back and check for more */
 		goto converting;
 	}
@@ -675,6 +675,7 @@ static int dlm_thread(void *data)
 			/* lockres can be re-dirtied/re-added to the
 			 * dirty_list in this gap, but that is ok */

+			spin_lock(&dlm->ast_lock);
 			spin_lock(&res->spinlock);
 			if (res->owner != dlm->node_num) {
 				__dlm_print_one_lock_resource(res);
@@ -695,6 +696,7 @@ static int dlm_thread(void *data)
 				/* move it to the tail and keep going */
 				res->state &= ~DLM_LOCK_RES_DIRTY;
 				spin_unlock(&res->spinlock);
+				spin_unlock(&dlm->ast_lock);
 				mlog(0, "delaying list shuffling for in-"
 				     "progress lockres %.*s, state=%d\n",
 				     res->lockname.len, res->lockname.name,
@@ -716,6 +718,7 @@ static int dlm_thread(void *data)
 			dlm_shuffle_lists(dlm, res);
 			res->state &= ~DLM_LOCK_RES_DIRTY;
 			spin_unlock(&res->spinlock);
+			spin_unlock(&dlm->ast_lock);

 			dlm_lockres_calc_usage(dlm, res);

diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 49e29ecd0201..817287c6a6db 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -28,7 +28,6 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
@@ -355,7 +354,8 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
 			mlog(0, "master was in-progress. retry\n");
 		ret = status;
 	} else {
-		mlog_errno(tmpret);
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", tmpret, DLM_UNLOCK_LOCK_MSG, dlm->key, owner);
 		if (dlm_is_host_down(tmpret)) {
 			/* NOTE: this seems strange, but it is what we want.
 			 * when the master goes down during a cancel or
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 1b0de157a08c..b83d6107a1f5 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -112,20 +112,20 @@ MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES);
  * O_RDONLY -> PRMODE level
  * O_WRONLY -> EXMODE level
  *
- * O_NONBLOCK -> LKM_NOQUEUE
+ * O_NONBLOCK -> NOQUEUE
  */
 static int dlmfs_decode_open_flags(int open_flags,
 				   int *level,
 				   int *flags)
 {
 	if (open_flags & (O_WRONLY|O_RDWR))
-		*level = LKM_EXMODE;
+		*level = DLM_LOCK_EX;
 	else
-		*level = LKM_PRMODE;
+		*level = DLM_LOCK_PR;
 
 	*flags = 0;
 	if (open_flags & O_NONBLOCK)
-		*flags |= LKM_NOQUEUE;
+		*flags |= DLM_LKF_NOQUEUE;
 
 	return 0;
 }
@@ -166,7 +166,7 @@ static int dlmfs_file_open(struct inode *inode,
 	 * to be able userspace to be able to distinguish a
 	 * valid lock request from one that simply couldn't be
 	 * granted. */
-	if (flags & LKM_NOQUEUE && status == -EAGAIN)
+	if (flags & DLM_LKF_NOQUEUE && status == -EAGAIN)
 		status = -ETXTBSY;
 	kfree(fp);
 	goto bail;
@@ -193,7 +193,7 @@ static int dlmfs_file_release(struct inode *inode,
 	status = 0;
 	if (fp) {
 		level = fp->fp_lock_level;
-		if (level != LKM_IVMODE)
+		if (level != DLM_LOCK_IV)
 			user_dlm_cluster_unlock(&ip->ip_lockres, level);
 
 		kfree(fp);
@@ -262,7 +262,7 @@ static ssize_t dlmfs_file_read(struct file *filp,
 	if ((count + *ppos) > i_size_read(inode))
 		readlen = i_size_read(inode) - *ppos;
 	else
-		readlen = count - *ppos;
+		readlen = count;
 
 	lvb_buf = kmalloc(readlen, GFP_NOFS);
 	if (!lvb_buf)
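The dlmfs hunks above are a mechanical rename from o2dlm's LKM_* names to the fs/dlm-style DLM_LOCK_*/DLM_LKF_* constants; the open-flag decoding itself is unchanged. As a hedged illustration of what that decoding means from userspace (the mount point and lock-file path here are assumptions, not from this patch):

    /* Sketch: dlmfs maps open(2) flags onto cluster lock requests.
     * O_RDONLY -> DLM_LOCK_PR, O_WRONLY -> DLM_LOCK_EX,
     * O_NONBLOCK -> DLM_LKF_NOQUEUE; a refused NOQUEUE request comes
     * back as -ETXTBSY, per dlmfs_file_open() above. */
    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/dlm/mydomain/mylock", O_WRONLY | O_NONBLOCK);

            if (fd < 0) {
                    if (errno == ETXTBSY)
                            fprintf(stderr, "lock held elsewhere\n");
                    return 1;
            }
            /* exclusive lock is held while the file stays open */
            close(fd);
            return 0;
    }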
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 8298608d4165..50c4ee805da4 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1881,7 +1881,7 @@ out:
 * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of
 * flock() calls. The locking approach this requires is sufficiently
 * different from all other cluster lock types that we implement a
- * seperate path to the "low-level" dlm calls. In particular:
+ * separate path to the "low-level" dlm calls. In particular:
 *
 * - No optimization of lock levels is done - we take at exactly
 *   what's been requested.
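For reference, the comment above describes the handling behind ordinary flock(2) calls; a trylock from userspace looks like this (standard API; only the mount path is an assumption):

    #include <fcntl.h>
    #include <sys/file.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/mnt/ocfs2/shared.dat", O_RDWR);

            if (fd < 0)
                    return 1;
            /* LOCK_NB is what ends up as a no-queue cluster request */
            if (flock(fd, LOCK_EX | LOCK_NB) == 0) {
                    /* ... exclusive access ... */
                    flock(fd, LOCK_UN);
            }
            close(fd);
            return 0;
    }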
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 5328529e7fd2..09e3fdfa6d33 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -24,6 +24,7 @@
 
 #include <linux/fs.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/fiemap.h>
 
@@ -453,7 +454,7 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
 	if (i == -1) {
 		/*
 		 * Holes can be larger than the maximum size of an
-		 * extent, so we return their lengths in a seperate
+		 * extent, so we return their lengths in a separate
 		 * field.
 		 */
 		if (hole_len) {
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 17947dc8341e..f74f1400eccd 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -278,10 +278,7 @@ int ocfs2_update_inode_atime(struct inode *inode,
 	inode->i_atime = CURRENT_TIME;
 	di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
 	di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
-
-	ret = ocfs2_journal_dirty(handle, bh);
-	if (ret < 0)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(handle, bh);
 
 out_commit:
 	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
@@ -430,9 +427,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
 	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
 
-	status = ocfs2_journal_dirty(handle, fe_bh);
-	if (status < 0)
-		mlog_errno(status);
+	ocfs2_journal_dirty(handle, fe_bh);
 
 out_commit:
 	ocfs2_commit_trans(osb, handle);
@@ -449,7 +444,6 @@ static int ocfs2_truncate_file(struct inode *inode,
 	int status = 0;
 	struct ocfs2_dinode *fe = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_truncate_context *tc = NULL;
 
 	mlog_entry("(inode = %llu, new_i_size = %llu\n",
 		   (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -488,6 +482,9 @@ static int ocfs2_truncate_file(struct inode *inode,
 
 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
 
+	ocfs2_resv_discard(&osb->osb_la_resmap,
+			   &OCFS2_I(inode)->ip_la_data_resv);
+
 	/*
 	 * The inode lock forced other nodes to sync and drop their
 	 * pages, which (correctly) happens even if we have a truncate
@@ -517,13 +514,7 @@ static int ocfs2_truncate_file(struct inode *inode,
 		goto bail_unlock_sem;
 	}
 
-	status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail_unlock_sem;
-	}
-
-	status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
+	status = ocfs2_commit_truncate(osb, inode, di_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail_unlock_sem;
@@ -666,11 +657,7 @@ restarted_transaction:
 		goto leave;
 	}
 
-	status = ocfs2_journal_dirty(handle, bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
+	ocfs2_journal_dirty(handle, bh);
 
 	spin_lock(&OCFS2_I(inode)->ip_lock);
 	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
@@ -684,6 +671,7 @@ restarted_transaction:
 	if (why == RESTART_META) {
 		mlog(0, "restarting function.\n");
 		restart_func = 1;
+		status = 0;
 	} else {
 		BUG_ON(why != RESTART_TRANS);
 
@@ -1194,9 +1182,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
 	di = (struct ocfs2_dinode *) bh->b_data;
 	di->i_mode = cpu_to_le16(inode->i_mode);
 
-	ret = ocfs2_journal_dirty(handle, bh);
-	if (ret < 0)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(handle, bh);
 
 out_trans:
 	ocfs2_commit_trans(osb, handle);
@@ -1433,16 +1419,90 @@ out:
 	return ret;
 }
 
+static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
+{
+	int i;
+	struct ocfs2_extent_rec *rec = NULL;
+
+	for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+
+		rec = &el->l_recs[i];
+
+		if (le32_to_cpu(rec->e_cpos) < pos)
+			break;
+	}
+
+	return i;
+}
+
+/*
+ * Helper to calculate the punching pos and length in one run, we handle the
+ * following three cases in order:
+ *
+ * - remove the entire record
+ * - remove a partial record
+ * - no record needs to be removed (hole-punching completed)
+ */
+static void ocfs2_calc_trunc_pos(struct inode *inode,
+				 struct ocfs2_extent_list *el,
+				 struct ocfs2_extent_rec *rec,
+				 u32 trunc_start, u32 *trunc_cpos,
+				 u32 *trunc_len, u32 *trunc_end,
+				 u64 *blkno, int *done)
+{
+	int ret = 0;
+	u32 coff, range;
+
+	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+
+	if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
+		*trunc_cpos = le32_to_cpu(rec->e_cpos);
+		/*
+		 * Skip holes if any.
+		 */
+		if (range < *trunc_end)
+			*trunc_end = range;
+		*trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos);
+		*blkno = le64_to_cpu(rec->e_blkno);
+		*trunc_end = le32_to_cpu(rec->e_cpos);
+	} else if (range > trunc_start) {
+		*trunc_cpos = trunc_start;
+		*trunc_len = *trunc_end - trunc_start;
+		coff = trunc_start - le32_to_cpu(rec->e_cpos);
+		*blkno = le64_to_cpu(rec->e_blkno) +
+			 ocfs2_clusters_to_blocks(inode->i_sb, coff);
+		*trunc_end = trunc_start;
+	} else {
+		/*
+		 * It may have two following possibilities:
+		 *
+		 * - last record has been removed
+		 * - trunc_start was within a hole
+		 *
+		 * both two cases mean the completion of hole punching.
+		 */
+		ret = 1;
+	}
+
+	*done = ret;
+}
+
 static int ocfs2_remove_inode_range(struct inode *inode,
 				    struct buffer_head *di_bh, u64 byte_start,
 				    u64 byte_len)
 {
-	int ret = 0;
-	u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size;
+	int ret = 0, flags = 0, done = 0, i;
+	u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
+	u32 cluster_in_el;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_cached_dealloc_ctxt dealloc;
 	struct address_space *mapping = inode->i_mapping;
 	struct ocfs2_extent_tree et;
+	struct ocfs2_path *path = NULL;
+	struct ocfs2_extent_list *el = NULL;
+	struct ocfs2_extent_rec *rec = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);
 
 	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
 	ocfs2_init_dealloc_ctxt(&dealloc);
@@ -1468,17 +1528,35 @@ static int ocfs2_remove_inode_range(struct inode *inode,
 		goto out;
 	}
 
+	/*
+	 * For reflinks, we may need to CoW 2 clusters which might be
+	 * partially zero'd later, if hole's start and end offset were
+	 * within one cluster(means is not exactly aligned to clustersize).
+	 */
+
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
+
+		ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
 	trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
-	trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits;
-	if (trunc_len >= trunc_start)
-		trunc_len -= trunc_start;
-	else
-		trunc_len = 0;
+	trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
+	cluster_in_el = trunc_end;
 
-	mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n",
+	mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, cend: %u\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 	     (unsigned long long)byte_start,
-	     (unsigned long long)byte_len, trunc_start, trunc_len);
+	     (unsigned long long)byte_len, trunc_start, trunc_end);
 
 	ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
 	if (ret) {
@@ -1486,31 +1564,79 @@ static int ocfs2_remove_inode_range(struct inode *inode,
 		goto out;
 	}
 
-	cpos = trunc_start;
-	while (trunc_len) {
-		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
-					 &alloc_size, NULL);
+	path = ocfs2_new_path_from_et(&et);
+	if (!path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	while (trunc_end > trunc_start) {
+
+		ret = ocfs2_find_path(INODE_CACHE(inode), path,
+				      cluster_in_el);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 
-		if (alloc_size > trunc_len)
-			alloc_size = trunc_len;
+		el = path_leaf_el(path);
 
-		/* Only do work for non-holes */
-		if (phys_cpos != 0) {
-			ret = ocfs2_remove_btree_range(inode, &et, cpos,
-						       phys_cpos, alloc_size,
-						       &dealloc);
+		i = ocfs2_find_rec(el, trunc_end);
+		/*
+		 * Need to go to previous extent block.
+		 */
+		if (i < 0) {
+			if (path->p_tree_depth == 0)
+				break;
+
+			ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
+							    path,
+							    &cluster_in_el);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
 			}
+
+			/*
+			 * We've reached the leftmost extent block,
+			 * it's safe to leave.
+			 */
+			if (cluster_in_el == 0)
+				break;
+
+			/*
+			 * The 'pos' searched for previous extent block is
+			 * always one cluster less than actual trunc_end.
+			 */
+			trunc_end = cluster_in_el + 1;
+
+			ocfs2_reinit_path(path, 1);
+
+			continue;
+
+		} else
+			rec = &el->l_recs[i];
+
+		ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
+				     &trunc_len, &trunc_end, &blkno, &done);
+		if (done)
+			break;
+
+		flags = rec->e_flags;
+		phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
+
+		ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
					       phys_cpos, trunc_len, flags,
+					       &dealloc, refcount_loc);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
 		}
 
-		cpos += alloc_size;
-		trunc_len -= alloc_size;
+		cluster_in_el = trunc_end;
+
+		ocfs2_reinit_path(path, 1);
 	}
 
 	ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
@@ -1981,18 +2107,18 @@ relock:
 	/* communicate with ocfs2_dio_end_io */
 	ocfs2_iocb_set_rw_locked(iocb, rw_level);
 
-	if (direct_io) {
-		ret = generic_segment_checks(iov, &nr_segs, &ocount,
-					     VERIFY_READ);
-		if (ret)
-			goto out_dio;
+	ret = generic_segment_checks(iov, &nr_segs, &ocount,
+				     VERIFY_READ);
+	if (ret)
+		goto out_dio;
 
 	count = ocount;
 	ret = generic_write_checks(file, ppos, &count,
 				   S_ISBLK(inode->i_mode));
 	if (ret)
 		goto out_dio;
 
+	if (direct_io) {
 		written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
 						    ppos, count, ocount);
 		if (written < 0) {
@@ -2007,7 +2133,10 @@ relock:
 			goto out_dio;
 		}
 	} else {
-		written = __generic_file_aio_write(iocb, iov, nr_segs, ppos);
+		current->backing_dev_info = file->f_mapping->backing_dev_info;
+		written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos,
+						      ppos, count, 0);
+		current->backing_dev_info = NULL;
 	}
 
 out_dio:
@@ -2021,9 +2150,9 @@ out_dio:
 	if (ret < 0)
 		written = ret;
 
-	if (!ret && (old_size != i_size_read(inode) ||
-		     old_clusters != OCFS2_I(inode)->ip_clusters ||
+	if (!ret && ((old_size != i_size_read(inode)) ||
+		     (old_clusters != OCFS2_I(inode)->ip_clusters) ||
 		     has_refcount)) {
 		ret = jbd2_journal_force_commit(osb->journal->j_journal);
 		if (ret < 0)
 			written = ret;
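The rewritten ocfs2_remove_inode_range() above no longer walks the hole left-to-right with ocfs2_get_clusters(); it walks extent records right-to-left from trunc_end, letting ocfs2_calc_trunc_pos() classify each step (whole record, partial record, or done). A standalone sketch of that walk, with simplified stand-in types rather than the kernel structures:

    #include <stdio.h>

    struct rec { unsigned int cpos, clusters; };

    /* stand-in for ocfs2_remove_btree_range() */
    static void remove_range(unsigned int cpos, unsigned int len)
    {
            printf("punch clusters [%u, %u)\n", cpos, cpos + len);
    }

    static unsigned int min_u32(unsigned int a, unsigned int b)
    {
            return a < b ? a : b;
    }

    static void punch(const struct rec *recs, int nrecs,
                      unsigned int trunc_start, unsigned int trunc_end)
    {
            while (trunc_end > trunc_start) {
                    int i;

                    /* rightmost record starting below trunc_end
                     * (what ocfs2_find_rec() does per leaf) */
                    for (i = nrecs - 1; i >= 0; i--)
                            if (recs[i].cpos < trunc_end)
                                    break;
                    if (i < 0)
                            break;          /* nothing left of the hole */

                    if (recs[i].cpos >= trunc_start) {
                            /* whole record; clamp to skip trailing holes */
                            unsigned int end = min_u32(recs[i].cpos +
                                                       recs[i].clusters,
                                                       trunc_end);
                            remove_range(recs[i].cpos, end - recs[i].cpos);
                            trunc_end = recs[i].cpos;
                    } else if (recs[i].cpos + recs[i].clusters > trunc_start) {
                            /* partial record: punch only its tail */
                            remove_range(trunc_start, trunc_end - trunc_start);
                            trunc_end = trunc_start;
                    } else {
                            break;          /* trunc_start fell in a hole */
                    }
            }
    }

    int main(void)
    {
            const struct rec recs[] = { { 0, 4 }, { 8, 4 }, { 12, 8 } };

            punch(recs, 3, 2, 14);  /* punch clusters [2, 14) */
            return 0;
    }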
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index c6e7213db868..1aa863dd901f 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -26,7 +26,6 @@
 
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 
 #define MLOG_MASK_PREFIX ML_SUPER
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 278a223aae14..abb0a95cc717 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -25,7 +25,6 @@
 
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
@@ -377,6 +376,10 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 
 	OCFS2_I(inode)->ip_last_used_slot = 0;
 	OCFS2_I(inode)->ip_last_used_group = 0;
+
+	if (S_ISDIR(inode->i_mode))
+		ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv,
+				    OCFS2_RESV_FLAG_DIR);
 	mlog_exit_void();
 }
 
@@ -540,7 +543,6 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
 				     struct buffer_head *fe_bh)
 {
 	int status = 0;
-	struct ocfs2_truncate_context *tc = NULL;
 	struct ocfs2_dinode *fe;
 	handle_t *handle = NULL;
 
@@ -559,6 +561,7 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
 		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 		if (IS_ERR(handle)) {
 			status = PTR_ERR(handle);
+			handle = NULL;
 			mlog_errno(status);
 			goto out;
 		}
@@ -582,13 +585,7 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
 		ocfs2_commit_trans(osb, handle);
 		handle = NULL;
 
-		status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
-		if (status < 0) {
-			mlog_errno(status);
-			goto out;
-		}
-
-		status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
+		status = ocfs2_commit_truncate(osb, inode, fe_bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto out;
@@ -640,11 +637,13 @@ static int ocfs2_remove_inode(struct inode *inode,
 		goto bail_unlock;
 	}
 
-	status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
-				  orphan_dir_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail_commit;
+	if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
+		status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
+					  orphan_dir_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail_commit;
+		}
 	}
 
 	/* set the inodes dtime */
@@ -657,12 +656,7 @@ static int ocfs2_remove_inode(struct inode *inode,
 
 	di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
 	di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
-
-	status = ocfs2_journal_dirty(handle, di_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail_commit;
-	}
+	ocfs2_journal_dirty(handle, di_bh);
 
 	ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh);
 	dquot_free_inode(inode);
@@ -723,38 +717,39 @@ static void ocfs2_signal_wipe_completion(struct ocfs2_super *osb,
 static int ocfs2_wipe_inode(struct inode *inode,
 			    struct buffer_head *di_bh)
 {
-	int status, orphaned_slot;
+	int status, orphaned_slot = -1;
 	struct inode *orphan_dir_inode = NULL;
 	struct buffer_head *orphan_dir_bh = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_dinode *di;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
 
-	di = (struct ocfs2_dinode *) di_bh->b_data;
-	orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
+	if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
+		orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
 
 		status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
 		if (status)
 			return status;
 
 		orphan_dir_inode = ocfs2_get_system_file_inode(osb,
							       ORPHAN_DIR_SYSTEM_INODE,
							       orphaned_slot);
		if (!orphan_dir_inode) {
			status = -EEXIST;
			mlog_errno(status);
			goto bail;
		}

		/* Lock the orphan dir. The lock will be held for the entire
		 * delete_inode operation. We do this now to avoid races with
		 * recovery completion on other nodes. */
		mutex_lock(&orphan_dir_inode->i_mutex);
		status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
		if (status < 0) {
			mutex_unlock(&orphan_dir_inode->i_mutex);

			mlog_errno(status);
			goto bail;
+		}
 	}
 
 	/* we do this while holding the orphan dir lock because we
@@ -795,6 +790,9 @@ static int ocfs2_wipe_inode(struct inode *inode,
 		mlog_errno(status);
 
 bail_unlock_dir:
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)
+		return status;
+
 	ocfs2_inode_unlock(orphan_dir_inode, 1);
 	mutex_unlock(&orphan_dir_inode->i_mutex);
 	brelse(orphan_dir_bh);
@@ -890,7 +888,23 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
 
 	/* Do some basic inode verification... */
 	di = (struct ocfs2_dinode *) di_bh->b_data;
-	if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) {
+	if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL)) &&
+	    !(oi->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
+		/*
+		 * Inodes in the orphan dir must have ORPHANED_FL. The only
+		 * inodes that come back out of the orphan dir are reflink
+		 * targets. A reflink target may be moved out of the orphan
+		 * dir between the time we scan the directory and the time we
+		 * process it. This would lead to HAS_REFCOUNT_FL being set but
+		 * ORPHANED_FL not.
+		 */
+		if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) {
+			mlog(0, "Reflinked inode %llu is no longer orphaned. "
+			     "it shouldn't be deleted\n",
+			     (unsigned long long)oi->ip_blkno);
+			goto bail;
+		}
+
 		/* for lack of a better error? */
 		status = -EEXIST;
 		mlog(ML_ERROR,
@@ -958,7 +972,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode,
 void ocfs2_delete_inode(struct inode *inode)
 {
 	int wipe, status;
-	sigset_t blocked, oldset;
+	sigset_t oldset;
 	struct buffer_head *di_bh = NULL;
 
 	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
@@ -985,13 +999,7 @@ void ocfs2_delete_inode(struct inode *inode)
 	 * messaging paths may return us -ERESTARTSYS. Which would
 	 * cause us to exit early, resulting in inodes being orphaned
 	 * forever. */
-	sigfillset(&blocked);
-	status = sigprocmask(SIG_BLOCK, &blocked, &oldset);
-	if (status < 0) {
-		mlog_errno(status);
-		ocfs2_cleanup_delete_inode(inode, 1);
-		goto bail;
-	}
+	ocfs2_block_signals(&oldset);
 
 	/*
 	 * Synchronize us against ocfs2_get_dentry. We take this in
@@ -1065,9 +1073,7 @@ bail_unlock_nfs_sync:
 	ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0);
 
 bail_unblock:
-	status = sigprocmask(SIG_SETMASK, &oldset, NULL);
-	if (status < 0)
-		mlog_errno(status);
+	ocfs2_unblock_signals(&oldset);
bail:
 	clear_inode(inode);
 	mlog_exit_void();
@@ -1101,6 +1107,10 @@ void ocfs2_clear_inode(struct inode *inode)
 	ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres);
 	ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
 
+	ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap,
+			   &oi->ip_la_data_resv);
+	ocfs2_resv_init_once(&oi->ip_la_data_resv);
+
 	/* We very well may get a clear_inode before all an inodes
 	 * metadata has hit disk. Of course, we can't drop any cluster
 	 * locks until the journal has finished with it. The only
@@ -1276,13 +1286,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
 	fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
 	fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
 
-	status = ocfs2_journal_dirty(handle, bh);
-	if (status < 0)
-		mlog_errno(status);
-
-	status = 0;
+	ocfs2_journal_dirty(handle, bh);
leave:
-
 	mlog_exit(status);
 	return status;
 }
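ocfs2_block_signals()/ocfs2_unblock_signals() replace the open-coded sigfillset()/sigprocmask() pairs that this series deletes from inode.c, mmap.c and namei.c. A sketch of what such helpers amount to (illustrative; the in-kernel versions live in ocfs2's super code and need not match this exactly):

    /* Assumed shape of the helpers; mirrors the pattern being removed.
     * In-kernel, sigprocmask() on current cannot fail for these calls,
     * which is why the callers no longer check a return value. */
    void ocfs2_block_signals(sigset_t *oldset)
    {
            int rc;
            sigset_t blocked;

            sigfillset(&blocked);
            rc = sigprocmask(SIG_BLOCK, &blocked, oldset);
            BUG_ON(rc);
    }

    void ocfs2_unblock_signals(sigset_t *oldset)
    {
            int rc = sigprocmask(SIG_SETMASK, oldset, NULL);

            BUG_ON(rc);
    }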
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index ba4fe07b293c..9f5f5fcadc45 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -70,6 +70,8 @@ struct ocfs2_inode_info
 	/* Only valid if the inode is the dir. */
 	u32				ip_last_used_slot;
 	u64				ip_last_used_group;
+
+	struct ocfs2_alloc_reservation	ip_la_data_resv;
 };
 
 /*
@@ -100,6 +102,8 @@ struct ocfs2_inode_info
 #define OCFS2_INODE_MAYBE_ORPHANED	0x00000020
 /* Does someone have the file open O_DIRECT */
 #define OCFS2_INODE_OPEN_DIRECT		0x00000040
+/* Tell the inode wipe code it's not in orphan dir */
+#define OCFS2_INODE_SKIP_ORPHAN_DIR	0x00000080
 
 static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
 {
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 9336c60e3a36..47878cf16418 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -402,9 +402,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
 }
 
 /*
- * 'nblocks' is what you want to add to the current
- * transaction. extend_trans will either extend the current handle by
- * nblocks, or commit it and start a new one with nblocks credits.
+ * 'nblocks' is what you want to add to the current transaction.
 *
 * This might call jbd2_journal_restart() which will commit dirty buffers
 * and then restart the transaction. Before calling
@@ -422,11 +420,15 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
 */
 int ocfs2_extend_trans(handle_t *handle, int nblocks)
 {
-	int status;
+	int status, old_nblocks;
 
 	BUG_ON(!handle);
-	BUG_ON(!nblocks);
+	BUG_ON(nblocks < 0);
+
+	if (!nblocks)
+		return 0;
 
+	old_nblocks = handle->h_buffer_credits;
 	mlog_entry_void();
 
 	mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
@@ -445,7 +447,8 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
 		mlog(0,
 		     "jbd2_journal_extend failed, trying "
 		     "jbd2_journal_restart\n");
-		status = jbd2_journal_restart(handle, nblocks);
+		status = jbd2_journal_restart(handle,
+					      old_nblocks + nblocks);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -734,8 +737,7 @@ int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
 	return __ocfs2_journal_access(handle, ci, bh, NULL, type);
 }
 
-int ocfs2_journal_dirty(handle_t *handle,
-			struct buffer_head *bh)
+void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
 {
 	int status;
 
@@ -743,13 +745,9 @@ int ocfs2_journal_dirty(handle_t *handle,
 	     (unsigned long long)bh->b_blocknr);
 
 	status = jbd2_journal_dirty_metadata(handle, bh);
-	if (status < 0)
-		mlog(ML_ERROR, "Could not dirty metadata buffer. "
-		     "(bh->b_blocknr=%llu)\n",
-		     (unsigned long long)bh->b_blocknr);
+	BUG_ON(status);
 
-	mlog_exit(status);
-	return status;
+	mlog_exit_void();
 }
 
 #define OCFS2_DEFAULT_COMMIT_INTERVAL	(HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
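The ocfs2_extend_trans() fix above matters because jbd2_journal_restart() resets the handle to exactly the credit count passed in; restarting with only the new nblocks would silently shrink the reservation the caller already held. A hedged kernel-context fragment of the resulting extend-or-restart pattern (simplified from the function above, not verbatim):

    /* Sketch: extend in place if jbd2 can, otherwise commit and
     * restart with the old credits plus the new ones. */
    static int sketch_extend_trans(handle_t *handle, int nblocks)
    {
            int old_nblocks = handle->h_buffer_credits;

            if (jbd2_journal_extend(handle, nblocks) == 0)
                    return 0;               /* extended in place */

            return jbd2_journal_restart(handle, old_nblocks + nblocks);
    }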
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 3f74e09b0d80..b5baaa8e710f 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -325,8 +325,7 @@ int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
 *	<modify the bh>
 *	ocfs2_journal_dirty(handle, bh);
 */
-int ocfs2_journal_dirty(handle_t *handle,
-			struct buffer_head *bh);
+void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh);
 
 /*
 *  Credit Macros:
@@ -562,6 +561,18 @@ static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
 	return blocks;
 }
 
+/*
+ * Allocating a discontiguous block group requires the credits from
+ * ocfs2_calc_group_alloc_credits() as well as enough credits to fill
+ * the group descriptor's extent list. The caller already has started
+ * the transaction with ocfs2_calc_group_alloc_credits(). They extend
+ * it with these credits.
+ */
+static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb)
+{
+	return ocfs2_extent_recs_per_gd(sb);
+}
+
 static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
 						unsigned int clusters_to_del,
 						struct ocfs2_dinode *fe,
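Per the comment above, a caller allocating a possibly discontiguous block group starts the transaction with ocfs2_calc_group_alloc_credits() and then tops it up. The fragment below is an assumed usage sketch only; the variable names and surrounding context are illustrative, not from this patch:

    /* Hedged sketch of the intended call pattern. */
    handle = ocfs2_start_trans(osb, ocfs2_calc_group_alloc_credits(sb, cpg));
    if (IS_ERR(handle))
            return PTR_ERR(handle);

    /* discontiguous group: extend by one credit per extent record */
    status = ocfs2_extend_trans(handle, ocfs2_calc_bg_discontig_credits(sb));
    if (status < 0)
            goto out_commit;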
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ca992d91f511..3d7419682dc0 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -52,7 +52,8 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
 
 static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
 					     struct ocfs2_dinode *alloc,
-					     u32 numbits);
+					     u32 *numbits,
+					     struct ocfs2_alloc_reservation *resv);
 
 static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc);
 
@@ -74,6 +75,144 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 					  struct inode *local_alloc_inode);
 
+/*
+ * ocfs2_la_default_mb() - determine a default size, in megabytes of
+ * the local alloc.
+ *
+ * Generally, we'd like to pick as large a local alloc as
+ * possible. Performance on large workloads tends to scale
+ * proportionally to la size. In addition to that, the reservations
+ * code functions more efficiently as it can reserve more windows for
+ * write.
+ *
+ * Some things work against us when trying to choose a large local alloc:
+ *
+ * - We need to ensure our sizing is picked to leave enough space in
+ *   group descriptors for other allocations (such as block groups,
+ *   etc). Picking default sizes which are a multiple of 4 could help
+ *   - block groups are allocated in 2mb and 4mb chunks.
+ *
+ * - Likewise, we don't want to starve other nodes of bits on small
+ *   file systems. This can easily be taken care of by limiting our
+ *   default to a reasonable size (256M) on larger cluster sizes.
+ *
+ * - Some file systems can't support very large sizes - 4k and 8k in
+ *   particular are limited to less than 128 and 256 megabytes respectively.
+ *
+ * The following reference table shows group descriptor and local
+ * alloc maximums at various cluster sizes (4k blocksize)
+ *
+ * csize: 4K	group: 126M	la: 121M
+ * csize: 8K	group: 252M	la: 243M
+ * csize: 16K	group: 504M	la: 486M
+ * csize: 32K	group: 1008M	la: 972M
+ * csize: 64K	group: 2016M	la: 1944M
+ * csize: 128K	group: 4032M	la: 3888M
+ * csize: 256K	group: 8064M	la: 7776M
+ * csize: 512K	group: 16128M	la: 15552M
+ * csize: 1024K	group: 32256M	la: 31104M
+ */
+#define OCFS2_LA_MAX_DEFAULT_MB	256
+#define OCFS2_LA_OLD_DEFAULT	8
+unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
+{
+	unsigned int la_mb;
+	unsigned int gd_mb;
+	unsigned int megs_per_slot;
+	struct super_block *sb = osb->sb;
+
+	gd_mb = ocfs2_clusters_to_megabytes(osb->sb,
+		8 * ocfs2_group_bitmap_size(sb, 0, osb->s_feature_incompat));
+
+	/*
+	 * This takes care of files systems with very small group
+	 * descriptors - 512 byte blocksize at cluster sizes lower
+	 * than 16K and also 1k blocksize with 4k cluster size.
+	 */
+	if ((sb->s_blocksize == 512 && osb->s_clustersize <= 8192)
+	    || (sb->s_blocksize == 1024 && osb->s_clustersize == 4096))
+		return OCFS2_LA_OLD_DEFAULT;
+
+	/*
+	 * Leave enough room for some block groups and make the final
+	 * value we work from a multiple of 4.
+	 */
+	gd_mb -= 16;
+	gd_mb &= 0xFFFFFFFB;
+
+	la_mb = gd_mb;
+
+	/*
+	 * Keep window sizes down to a reasonable default
+	 */
+	if (la_mb > OCFS2_LA_MAX_DEFAULT_MB) {
+		/*
+		 * Some clustersize / blocksize combinations will have
+		 * given us a larger than OCFS2_LA_MAX_DEFAULT_MB
+		 * default size, but get poor distribution when
+		 * limited to exactly 256 megabytes.
+		 *
+		 * As an example, 16K clustersize at 4K blocksize
+		 * gives us a cluster group size of 504M. Paring the
+		 * local alloc size down to 256 however, would give us
+		 * only one window and around 200MB left in the
+		 * cluster group. Instead, find the first size below
+		 * 256 which would give us an even distribution.
+		 *
+		 * Larger cluster group sizes actually work out pretty
+		 * well when pared to 256, so we don't have to do this
+		 * for any group that fits more than two
+		 * OCFS2_LA_MAX_DEFAULT_MB windows.
+		 */
+		if (gd_mb > (2 * OCFS2_LA_MAX_DEFAULT_MB))
+			la_mb = 256;
+		else {
+			unsigned int gd_mult = gd_mb;
+
+			while (gd_mult > 256)
+				gd_mult = gd_mult >> 1;
+
+			la_mb = gd_mult;
+		}
+	}
+
+	megs_per_slot = osb->osb_clusters_at_boot / osb->max_slots;
+	megs_per_slot = ocfs2_clusters_to_megabytes(osb->sb, megs_per_slot);
+	/* Too many nodes, too few disk clusters. */
+	if (megs_per_slot < la_mb)
+		la_mb = megs_per_slot;
+
+	return la_mb;
+}
+
+void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb)
+{
+	struct super_block *sb = osb->sb;
+	unsigned int la_default_mb = ocfs2_la_default_mb(osb);
+	unsigned int la_max_mb;
+
+	la_max_mb = ocfs2_clusters_to_megabytes(sb,
+						ocfs2_local_alloc_size(sb) * 8);
+
+	mlog(0, "requested: %dM, max: %uM, default: %uM\n",
+	     requested_mb, la_max_mb, la_default_mb);
+
+	if (requested_mb == -1) {
+		/* No user request - use defaults */
+		osb->local_alloc_default_bits =
+			ocfs2_megabytes_to_clusters(sb, la_default_mb);
+	} else if (requested_mb > la_max_mb) {
+		/* Request is too big, we give the maximum available */
+		osb->local_alloc_default_bits =
+			ocfs2_megabytes_to_clusters(sb, la_max_mb);
+	} else {
+		osb->local_alloc_default_bits =
+			ocfs2_megabytes_to_clusters(sb, requested_mb);
+	}
+
+	osb->local_alloc_bits = osb->local_alloc_default_bits;
+}
+
 static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
 {
 	return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
@@ -156,7 +295,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
 		     osb->local_alloc_bits, (osb->bitmap_cpg - 1));
 		osb->local_alloc_bits =
 			ocfs2_megabytes_to_clusters(osb->sb,
-						    OCFS2_DEFAULT_LOCAL_ALLOC_SIZE);
+						    ocfs2_la_default_mb(osb));
 	}
 
 	/* read the alloc off disk */
@@ -262,6 +401,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
 
 	osb->local_alloc_state = OCFS2_LA_DISABLED;
 
+	ocfs2_resmap_uninit(&osb->osb_la_resmap);
+
 	main_bm_inode = ocfs2_get_system_file_inode(osb,
 						    GLOBAL_BITMAP_SYSTEM_INODE,
 						    OCFS2_INVALID_SLOT);
@@ -305,12 +446,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
 	}
 
 	ocfs2_clear_local_alloc(alloc);
-
-	status = ocfs2_journal_dirty(handle, bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto out_commit;
-	}
+	ocfs2_journal_dirty(handle, bh);
 
 	brelse(bh);
 	osb->local_alloc_bh = NULL;
@@ -481,46 +617,6 @@ out:
 	return status;
 }
 
-/* Check to see if the local alloc window is within ac->ac_max_block */
-static int ocfs2_local_alloc_in_range(struct inode *inode,
-				      struct ocfs2_alloc_context *ac,
-				      u32 bits_wanted)
-{
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_dinode *alloc;
-	struct ocfs2_local_alloc *la;
-	int start;
-	u64 block_off;
-
-	if (!ac->ac_max_block)
-		return 1;
-
-	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
-	la = OCFS2_LOCAL_ALLOC(alloc);
-
-	start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
-	if (start == -1) {
-		mlog_errno(-ENOSPC);
-		return 0;
-	}
-
-	/*
-	 * Converting (bm_off + start + bits_wanted) to blocks gives us
-	 * the blkno just past our actual allocation. This is perfect
-	 * to compare with ac_max_block.
-	 */
-	block_off = ocfs2_clusters_to_blocks(inode->i_sb,
-					     le32_to_cpu(la->la_bm_off) +
-					     start + bits_wanted);
-	mlog(0, "Checking %llu against %llu\n",
-	     (unsigned long long)block_off,
-	     (unsigned long long)ac->ac_max_block);
-	if (block_off > ac->ac_max_block)
-		return 0;
-
-	return 1;
-}
-
 /*
 * make sure we've got at least bits_wanted contiguous bits in the
@@ -613,17 +709,6 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 		mlog(0, "Calling in_range for max block %llu\n",
 		     (unsigned long long)ac->ac_max_block);
 
-		if (!ocfs2_local_alloc_in_range(local_alloc_inode, ac,
-						bits_wanted)) {
-			/*
-			 * The window is outside ac->ac_max_block.
-			 * This errno tells the caller to keep localalloc enabled
-			 * but to get the allocation from the main bitmap.
-			 */
-			status = -EFBIG;
-			goto bail;
-		}
-
 		ac->ac_inode = local_alloc_inode;
 		/* We should never use localalloc from another slot */
 		ac->ac_alloc_slot = osb->slot_num;
@@ -664,7 +749,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
 	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
 	la = OCFS2_LOCAL_ALLOC(alloc);
 
-	start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
+	start = ocfs2_local_alloc_find_clear_bits(osb, alloc, &bits_wanted,
+						  ac->ac_resv);
 	if (start == -1) {
 		/* TODO: Shouldn't we just BUG here? */
 		status = -ENOSPC;
@@ -674,8 +760,6 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
 
 	bitmap = la->la_bitmap;
 	*bit_off = le32_to_cpu(la->la_bm_off) + start;
-	/* local alloc is always contiguous by nature -- we never
-	 * delete bits from it! */
 	*num_bits = bits_wanted;
 
 	status = ocfs2_journal_access_di(handle,
@@ -687,18 +771,15 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
 		goto bail;
 	}
 
+	ocfs2_resmap_claimed_bits(&osb->osb_la_resmap, ac->ac_resv, start,
+				  bits_wanted);
+
 	while(bits_wanted--)
 		ocfs2_set_bit(start++, bitmap);
 
 	le32_add_cpu(&alloc->id1.bitmap1.i_used, *num_bits);
+	ocfs2_journal_dirty(handle, osb->local_alloc_bh);
 
-	status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
-
-	status = 0;
bail:
 	mlog_exit(status);
 	return status;
@@ -722,13 +803,17 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
 }
 
 static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
 					     struct ocfs2_dinode *alloc,
-					     u32 numbits)
+					     u32 *numbits,
+					     struct ocfs2_alloc_reservation *resv)
 {
 	int numfound, bitoff, left, startoff, lastzero;
+	int local_resv = 0;
+	struct ocfs2_alloc_reservation r;
 	void *bitmap = NULL;
+	struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap;
 
-	mlog_entry("(numbits wanted = %u)\n", numbits);
+	mlog_entry("(numbits wanted = %u)\n", *numbits);
 
 	if (!alloc->id1.bitmap1.i_total) {
 		mlog(0, "No bits in my window!\n");
@@ -736,6 +821,30 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
 		goto bail;
 	}
 
+	if (!resv) {
+		local_resv = 1;
+		ocfs2_resv_init_once(&r);
+		ocfs2_resv_set_type(&r, OCFS2_RESV_FLAG_TMP);
+		resv = &r;
+	}
+
+	numfound = *numbits;
+	if (ocfs2_resmap_resv_bits(resmap, resv, &bitoff, &numfound) == 0) {
+		if (numfound < *numbits)
+			*numbits = numfound;
+		goto bail;
+	}
+
+	/*
+	 * Code error. While reservations are enabled, local
+	 * allocation should _always_ go through them.
+	 */
+	BUG_ON(osb->osb_resv_level != 0);
+
+	/*
+	 * Reservations are disabled. Handle this the old way.
+	 */
+
 	bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
 
 	numfound = bitoff = startoff = 0;
@@ -761,7 +870,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
 			startoff = bitoff+1;
 		}
 		/* we got everything we needed */
-		if (numfound == numbits) {
+		if (numfound == *numbits) {
 			/* mlog(0, "Found it all!\n"); */
 			break;
 		}
@@ -770,12 +879,15 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
 	mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff,
 	     numfound);
 
-	if (numfound == numbits)
+	if (numfound == *numbits)
 		bitoff = startoff - numfound;
 	else
 		bitoff = -1;
 
bail:
+	if (local_resv)
+		ocfs2_resv_discard(resmap, resv);
+
 	mlog_exit(bitoff);
 	return bitoff;
 }
@@ -872,8 +984,10 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
 		     (unsigned long long)la_start_blk,
 		     (unsigned long long)blkno);
 
-		status = ocfs2_free_clusters(handle, main_bm_inode,
-					     main_bm_bh, blkno, count);
+		status = ocfs2_release_clusters(handle,
+						main_bm_inode,
+						main_bm_bh, blkno,
+						count);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -984,8 +1098,7 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
 	}
 
retry_enospc:
-	(*ac)->ac_bits_wanted = osb->local_alloc_bits;
-
+	(*ac)->ac_bits_wanted = osb->local_alloc_default_bits;
 	status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
 	if (status == -ENOSPC) {
 		if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
@@ -1048,7 +1161,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 	/* we used the generic suballoc reserve function, but we set
 	 * everything up nicely, so there's no reason why we can't use
 	 * the more specific cluster api to claim bits. */
-	status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits,
+	status = ocfs2_claim_clusters(handle, ac, osb->local_alloc_bits,
 				      &cluster_off, &cluster_count);
 	if (status == -ENOSPC) {
retry_enospc:
@@ -1061,7 +1174,8 @@ retry_enospc:
 		    OCFS2_LA_DISABLED)
 			goto bail;
 
-		status = ocfs2_claim_clusters(osb, handle, ac,
+		ac->ac_bits_wanted = osb->local_alloc_default_bits;
+		status = ocfs2_claim_clusters(handle, ac,
 					      osb->local_alloc_bits,
 					      &cluster_off,
 					      &cluster_count);
@@ -1096,6 +1210,9 @@ retry_enospc:
 	memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0,
 	       le16_to_cpu(la->la_size));
 
+	ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count,
+			     OCFS2_LOCAL_ALLOC(alloc)->la_bitmap);
+
 	mlog(0, "New window allocated:\n");
 	mlog(0, "window la_bm_off = %u\n",
 	     OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
@@ -1167,12 +1284,7 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 	}
 
 	ocfs2_clear_local_alloc(alloc);
-
-	status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
+	ocfs2_journal_dirty(handle, osb->local_alloc_bh);
 
 	status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
 					  main_bm_inode, main_bm_bh);
@@ -1190,7 +1302,6 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 
 	atomic_inc(&osb->alloc_stats.moves);
 
-	status = 0;
bail:
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
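To make the sizing table and the "even distribution" comment above concrete, here is a userspace re-implementation of just the paring step of ocfs2_la_default_mb() (gd_mb is taken as an input; the small-blocksize special cases and the per-slot clamp are omitted):

    #include <stdio.h>

    static unsigned int la_default_mb(unsigned int gd_mb)
    {
            unsigned int la_mb;

            gd_mb -= 16;            /* leave room for block groups */
            gd_mb &= 0xFFFFFFFB;    /* mask low bits as the kernel code does */
            la_mb = gd_mb;

            if (la_mb > 256) {
                    if (gd_mb > 2 * 256) {
                            la_mb = 256;    /* big groups pare well to 256M */
                    } else {
                            unsigned int gd_mult = gd_mb;

                            while (gd_mult > 256)  /* halve until it fits */
                                    gd_mult >>= 1;
                            la_mb = gd_mult;
                    }
            }
            return la_mb;
    }

    int main(void)
    {
            /* 16K clusters @ 4K blocks: ~504M group -> 244M, not 256M */
            printf("%u\n", la_default_mb(504));
            return 0;
    }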
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index ac5ea9f86653..1be9b5864460 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -30,6 +30,9 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb);
 
 void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb);
 
+void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb);
+unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb);
+
 int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
 				     int node_num,
 				     struct ocfs2_dinode **alloc_copy);
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 544ac6245175..b5cb3ede9408 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -133,7 +133,7 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
 
 	if (!(fl->fl_flags & FL_POSIX))
 		return -ENOLCK;
-	if (__mandatory_lock(inode))
+	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
 		return -ENOLCK;
 
 	return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 39737613424a..af2b8fe1f139 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -25,7 +25,6 @@
 
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/uio.h>
@@ -42,44 +41,20 @@
 #include "file.h"
 #include "inode.h"
 #include "mmap.h"
+#include "super.h"
 
-static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset)
-{
-	/* The best way to deal with signals in the vm path is
-	 * to block them upfront, rather than allowing the
-	 * locking paths to return -ERESTARTSYS. */
-	sigfillset(blocked);
-
-	/* We should technically never get a bad return value
-	 * from sigprocmask */
-	return sigprocmask(SIG_BLOCK, blocked, oldset);
-}
-
-static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset)
-{
-	return sigprocmask(SIG_SETMASK, oldset, NULL);
-}
 
 static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
 {
-	sigset_t blocked, oldset;
-	int error, ret;
+	sigset_t oldset;
+	int ret;
 
 	mlog_entry("(area=%p, page offset=%lu)\n", area, vmf->pgoff);
 
-	error = ocfs2_vm_op_block_sigs(&blocked, &oldset);
-	if (error < 0) {
-		mlog_errno(error);
-		ret = VM_FAULT_SIGBUS;
-		goto out;
-	}
-
+	ocfs2_block_signals(&oldset);
 	ret = filemap_fault(area, vmf);
+	ocfs2_unblock_signals(&oldset);
 
-	error = ocfs2_vm_op_unblock_sigs(&oldset);
-	if (error < 0)
-		mlog_errno(error);
-out:
 	mlog_exit_ptr(vmf->page);
 	return ret;
 }
@@ -159,14 +134,10 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct page *page = vmf->page;
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	struct buffer_head *di_bh = NULL;
-	sigset_t blocked, oldset;
-	int ret, ret2;
+	sigset_t oldset;
+	int ret;
 
-	ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
-	if (ret < 0) {
-		mlog_errno(ret);
-		return ret;
-	}
+	ocfs2_block_signals(&oldset);
 
 	/*
 	 * The cluster locks taken will block a truncate from another
@@ -194,9 +165,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	ocfs2_inode_unlock(inode, 1);
 
out:
-	ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
-	if (ret2 < 0)
-		mlog_errno(ret2);
+	ocfs2_unblock_signals(&oldset);
 	if (ret)
 		ret = VM_FAULT_SIGBUS;
 	return ret;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index d9cd4e373a53..db5dd3ed4df4 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -84,7 +84,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
84static int ocfs2_orphan_add(struct ocfs2_super *osb, 84static int ocfs2_orphan_add(struct ocfs2_super *osb,
85 handle_t *handle, 85 handle_t *handle,
86 struct inode *inode, 86 struct inode *inode,
87 struct ocfs2_dinode *fe, 87 struct buffer_head *fe_bh,
88 char *name, 88 char *name,
89 struct ocfs2_dir_lookup_result *lookup, 89 struct ocfs2_dir_lookup_result *lookup,
90 struct inode *orphan_dir_inode); 90 struct inode *orphan_dir_inode);
@@ -239,6 +239,8 @@ static int ocfs2_mknod(struct inode *dir,
239 }; 239 };
240 int did_quota_inode = 0; 240 int did_quota_inode = 0;
241 struct ocfs2_dir_lookup_result lookup = { NULL, }; 241 struct ocfs2_dir_lookup_result lookup = { NULL, };
242 sigset_t oldset;
243 int did_block_signals = 0;
242 244
243 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, 245 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
244 (unsigned long)dev, dentry->d_name.len, 246 (unsigned long)dev, dentry->d_name.len,
@@ -350,6 +352,10 @@ static int ocfs2_mknod(struct inode *dir,
350 goto leave; 352 goto leave;
351 } 353 }
352 354
355 /* Starting to change things; restart is no longer possible. */
356 ocfs2_block_signals(&oldset);
357 did_block_signals = 1;
358
353 status = dquot_alloc_inode(inode); 359 status = dquot_alloc_inode(inode);
354 if (status) 360 if (status)
355 goto leave; 361 goto leave;
@@ -384,11 +390,7 @@ static int ocfs2_mknod(struct inode *dir,
384 goto leave; 390 goto leave;
385 } 391 }
386 ocfs2_add_links_count(dirfe, 1); 392 ocfs2_add_links_count(dirfe, 1);
387 status = ocfs2_journal_dirty(handle, parent_fe_bh); 393 ocfs2_journal_dirty(handle, parent_fe_bh);
388 if (status < 0) {
389 mlog_errno(status);
390 goto leave;
391 }
392 inc_nlink(dir); 394 inc_nlink(dir);
393 } 395 }
394 396
@@ -408,23 +410,28 @@ static int ocfs2_mknod(struct inode *dir,
408 } 410 }
409 } 411 }
410 412
411 status = ocfs2_add_entry(handle, dentry, inode, 413 /*
412 OCFS2_I(inode)->ip_blkno, parent_fe_bh, 414 * Do this before adding the entry to the directory. We
413 &lookup); 415 * also set d_op after success so that ->d_iput() will clean up
414 if (status < 0) { 416 * the dentry lock even if ocfs2_add_entry() fails below.
417 */
418 status = ocfs2_dentry_attach_lock(dentry, inode,
419 OCFS2_I(dir)->ip_blkno);
420 if (status) {
415 mlog_errno(status); 421 mlog_errno(status);
416 goto leave; 422 goto leave;
417 } 423 }
424 dentry->d_op = &ocfs2_dentry_ops;
418 425
419 status = ocfs2_dentry_attach_lock(dentry, inode, 426 status = ocfs2_add_entry(handle, dentry, inode,
420 OCFS2_I(dir)->ip_blkno); 427 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
421 if (status) { 428 &lookup);
429 if (status < 0) {
422 mlog_errno(status); 430 mlog_errno(status);
423 goto leave; 431 goto leave;
424 } 432 }
425 433
426 insert_inode_hash(inode); 434 insert_inode_hash(inode);
427 dentry->d_op = &ocfs2_dentry_ops;
428 d_instantiate(dentry, inode); 435 d_instantiate(dentry, inode);
429 status = 0; 436 status = 0;
430leave: 437leave:
@@ -434,6 +441,8 @@ leave:
434 ocfs2_commit_trans(osb, handle); 441 ocfs2_commit_trans(osb, handle);
435 442
436 ocfs2_inode_unlock(dir, 1); 443 ocfs2_inode_unlock(dir, 1);
444 if (did_block_signals)
445 ocfs2_unblock_signals(&oldset);
437 446
438 if (status == -ENOSPC) 447 if (status == -ENOSPC)
439 mlog(0, "Disk is full\n"); 448 mlog(0, "Disk is full\n");
@@ -445,11 +454,6 @@ leave:
445 454
446 ocfs2_free_dir_lookup_result(&lookup); 455 ocfs2_free_dir_lookup_result(&lookup);
447 456
448 if ((status < 0) && inode) {
449 clear_nlink(inode);
450 iput(inode);
451 }
452
453 if (inode_ac) 457 if (inode_ac)
454 ocfs2_free_alloc_context(inode_ac); 458 ocfs2_free_alloc_context(inode_ac);
455 459
@@ -459,6 +463,17 @@ leave:
459 if (meta_ac) 463 if (meta_ac)
460 ocfs2_free_alloc_context(meta_ac); 464 ocfs2_free_alloc_context(meta_ac);
461 465
466 /*
467 * We should call iput after the i_mutex of the bitmap has been
468 * unlocked in ocfs2_free_alloc_context, or
469 * ocfs2_delete_inode will mutex_lock it again.
470 */
471 if ((status < 0) && inode) {
472 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
473 clear_nlink(inode);
474 iput(inode);
475 }
476
462 mlog_exit(status); 477 mlog_exit(status);
463 478
464 return status; 479 return status;
@@ -476,14 +491,15 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
476 int status = 0; 491 int status = 0;
477 struct ocfs2_dinode *fe = NULL; 492 struct ocfs2_dinode *fe = NULL;
478 struct ocfs2_extent_list *fel; 493 struct ocfs2_extent_list *fel;
479 u64 fe_blkno = 0; 494 u64 suballoc_loc, fe_blkno = 0;
480 u16 suballoc_bit; 495 u16 suballoc_bit;
481 u16 feat; 496 u16 feat;
482 497
483 *new_fe_bh = NULL; 498 *new_fe_bh = NULL;
484 499
485 status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh, 500 status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh,
486 inode_ac, &suballoc_bit, &fe_blkno); 501 inode_ac, &suballoc_loc,
502 &suballoc_bit, &fe_blkno);
487 if (status < 0) { 503 if (status < 0) {
488 mlog_errno(status); 504 mlog_errno(status);
489 goto leave; 505 goto leave;
@@ -520,6 +536,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
520 fe->i_generation = cpu_to_le32(inode->i_generation); 536 fe->i_generation = cpu_to_le32(inode->i_generation);
521 fe->i_fs_generation = cpu_to_le32(osb->fs_generation); 537 fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
522 fe->i_blkno = cpu_to_le64(fe_blkno); 538 fe->i_blkno = cpu_to_le64(fe_blkno);
539 fe->i_suballoc_loc = cpu_to_le64(suballoc_loc);
523 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); 540 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
524 fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); 541 fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
525 fe->i_uid = cpu_to_le32(inode->i_uid); 542 fe->i_uid = cpu_to_le32(inode->i_uid);
@@ -556,11 +573,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
556 fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb)); 573 fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
557 } 574 }
558 575
559 status = ocfs2_journal_dirty(handle, *new_fe_bh); 576 ocfs2_journal_dirty(handle, *new_fe_bh);
560 if (status < 0) {
561 mlog_errno(status);
562 goto leave;
563 }
564 577
565 ocfs2_populate_inode(inode, fe, 1); 578 ocfs2_populate_inode(inode, fe, 1);
566 ocfs2_ci_set_new(osb, INODE_CACHE(inode)); 579 ocfs2_ci_set_new(osb, INODE_CACHE(inode));
@@ -626,6 +639,7 @@ static int ocfs2_link(struct dentry *old_dentry,
626 struct ocfs2_dinode *fe = NULL; 639 struct ocfs2_dinode *fe = NULL;
627 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 640 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
628 struct ocfs2_dir_lookup_result lookup = { NULL, }; 641 struct ocfs2_dir_lookup_result lookup = { NULL, };
642 sigset_t oldset;
629 643
630 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino, 644 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
631 old_dentry->d_name.len, old_dentry->d_name.name, 645 old_dentry->d_name.len, old_dentry->d_name.name,
@@ -682,6 +696,9 @@ static int ocfs2_link(struct dentry *old_dentry,
682 goto out_unlock_inode; 696 goto out_unlock_inode;
683 } 697 }
684 698
699 /* Starting to change things; restart is no longer possible. */
700 ocfs2_block_signals(&oldset);
701
685 err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, 702 err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
686 OCFS2_JOURNAL_ACCESS_WRITE); 703 OCFS2_JOURNAL_ACCESS_WRITE);
687 if (err < 0) { 704 if (err < 0) {
@@ -694,14 +711,7 @@ static int ocfs2_link(struct dentry *old_dentry,
694 ocfs2_set_links_count(fe, inode->i_nlink); 711 ocfs2_set_links_count(fe, inode->i_nlink);
695 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 712 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
696 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 713 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
697 714 ocfs2_journal_dirty(handle, fe_bh);
698 err = ocfs2_journal_dirty(handle, fe_bh);
699 if (err < 0) {
700 ocfs2_add_links_count(fe, -1);
701 drop_nlink(inode);
702 mlog_errno(err);
703 goto out_commit;
704 }
705 715
706 err = ocfs2_add_entry(handle, dentry, inode, 716 err = ocfs2_add_entry(handle, dentry, inode,
707 OCFS2_I(inode)->ip_blkno, 717 OCFS2_I(inode)->ip_blkno,
@@ -725,6 +735,7 @@ static int ocfs2_link(struct dentry *old_dentry,
725 735
726out_commit: 736out_commit:
727 ocfs2_commit_trans(osb, handle); 737 ocfs2_commit_trans(osb, handle);
738 ocfs2_unblock_signals(&oldset);
728out_unlock_inode: 739out_unlock_inode:
729 ocfs2_inode_unlock(inode, 1); 740 ocfs2_inode_unlock(inode, 1);
730 741
@@ -879,7 +890,7 @@ static int ocfs2_unlink(struct inode *dir,
879 fe = (struct ocfs2_dinode *) fe_bh->b_data; 890 fe = (struct ocfs2_dinode *) fe_bh->b_data;
880 891
881 if (inode_is_unlinkable(inode)) { 892 if (inode_is_unlinkable(inode)) {
882 status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name, 893 status = ocfs2_orphan_add(osb, handle, inode, fe_bh, orphan_name,
883 &orphan_insert, orphan_dir); 894 &orphan_insert, orphan_dir);
884 if (status < 0) { 895 if (status < 0) {
885 mlog_errno(status); 896 mlog_errno(status);
@@ -898,12 +909,7 @@ static int ocfs2_unlink(struct inode *dir,
898 drop_nlink(inode); 909 drop_nlink(inode);
899 drop_nlink(inode); 910 drop_nlink(inode);
900 ocfs2_set_links_count(fe, inode->i_nlink); 911 ocfs2_set_links_count(fe, inode->i_nlink);
901 912 ocfs2_journal_dirty(handle, fe_bh);
902 status = ocfs2_journal_dirty(handle, fe_bh);
903 if (status < 0) {
904 mlog_errno(status);
905 goto leave;
906 }
907 913
908 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 914 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
909 if (S_ISDIR(inode->i_mode)) 915 if (S_ISDIR(inode->i_mode))
@@ -1300,7 +1306,7 @@ static int ocfs2_rename(struct inode *old_dir,
1300 if (S_ISDIR(new_inode->i_mode) || 1306 if (S_ISDIR(new_inode->i_mode) ||
1301 (ocfs2_read_links_count(newfe) == 1)) { 1307 (ocfs2_read_links_count(newfe) == 1)) {
1302 status = ocfs2_orphan_add(osb, handle, new_inode, 1308 status = ocfs2_orphan_add(osb, handle, new_inode,
1303 newfe, orphan_name, 1309 newfe_bh, orphan_name,
1304 &orphan_insert, orphan_dir); 1310 &orphan_insert, orphan_dir);
1305 if (status < 0) { 1311 if (status < 0) {
1306 mlog_errno(status); 1312 mlog_errno(status);
@@ -1321,12 +1327,7 @@ static int ocfs2_rename(struct inode *old_dir,
1321 ocfs2_set_links_count(newfe, 0); 1327 ocfs2_set_links_count(newfe, 0);
1322 else 1328 else
1323 ocfs2_add_links_count(newfe, -1); 1329 ocfs2_add_links_count(newfe, -1);
1324 1330 ocfs2_journal_dirty(handle, newfe_bh);
1325 status = ocfs2_journal_dirty(handle, newfe_bh);
1326 if (status < 0) {
1327 mlog_errno(status);
1328 goto bail;
1329 }
1330 } else { 1331 } else {
1331 /* if the name was not found in new_dir, add it now */ 1332 /* if the name was not found in new_dir, add it now */
1332 status = ocfs2_add_entry(handle, new_dentry, old_inode, 1333 status = ocfs2_add_entry(handle, new_dentry, old_inode,
@@ -1345,10 +1346,7 @@ static int ocfs2_rename(struct inode *old_dir,
1345 1346
1346 old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec); 1347 old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec);
1347 old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec); 1348 old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec);
1348 1349 ocfs2_journal_dirty(handle, old_inode_bh);
1349 status = ocfs2_journal_dirty(handle, old_inode_bh);
1350 if (status < 0)
1351 mlog_errno(status);
1352 } else 1350 } else
1353 mlog_errno(status); 1351 mlog_errno(status);
1354 1352
@@ -1420,7 +1418,7 @@ static int ocfs2_rename(struct inode *old_dir,
1420 OCFS2_JOURNAL_ACCESS_WRITE); 1418 OCFS2_JOURNAL_ACCESS_WRITE);
1421 fe = (struct ocfs2_dinode *) old_dir_bh->b_data; 1419 fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
1422 ocfs2_set_links_count(fe, old_dir->i_nlink); 1420 ocfs2_set_links_count(fe, old_dir->i_nlink);
1423 status = ocfs2_journal_dirty(handle, old_dir_bh); 1421 ocfs2_journal_dirty(handle, old_dir_bh);
1424 } 1422 }
1425 } 1423 }
1426 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); 1424 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
@@ -1552,11 +1550,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
1552 (bytes_left > sb->s_blocksize) ? sb->s_blocksize : 1550 (bytes_left > sb->s_blocksize) ? sb->s_blocksize :
1553 bytes_left); 1551 bytes_left);
1554 1552
1555 status = ocfs2_journal_dirty(handle, bhs[virtual]); 1553 ocfs2_journal_dirty(handle, bhs[virtual]);
1556 if (status < 0) {
1557 mlog_errno(status);
1558 goto bail;
1559 }
1560 1554
1561 virtual++; 1555 virtual++;
1562 p_blkno++; 1556 p_blkno++;
@@ -1600,6 +1594,8 @@ static int ocfs2_symlink(struct inode *dir,
1600 }; 1594 };
1601 int did_quota = 0, did_quota_inode = 0; 1595 int did_quota = 0, did_quota_inode = 0;
1602 struct ocfs2_dir_lookup_result lookup = { NULL, }; 1596 struct ocfs2_dir_lookup_result lookup = { NULL, };
1597 sigset_t oldset;
1598 int did_block_signals = 0;
1603 1599
1604 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, 1600 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
1605 dentry, symname, dentry->d_name.len, dentry->d_name.name); 1601 dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1695,6 +1691,10 @@ static int ocfs2_symlink(struct inode *dir,
1695 goto bail; 1691 goto bail;
1696 } 1692 }
1697 1693
1694 /* Starting to change things; restart is no longer possible. */
1695 ocfs2_block_signals(&oldset);
1696 did_block_signals = 1;
1697
1698 status = dquot_alloc_inode(inode); 1698 status = dquot_alloc_inode(inode);
1699 if (status) 1699 if (status)
1700 goto bail; 1700 goto bail;
@@ -1771,22 +1771,27 @@ static int ocfs2_symlink(struct inode *dir,
1771 } 1771 }
1772 } 1772 }
1773 1773
1774 status = ocfs2_add_entry(handle, dentry, inode, 1774 /*
1775 le64_to_cpu(fe->i_blkno), parent_fe_bh, 1775 * Do this before adding the entry to the directory. We
1776 &lookup); 1776 * also set d_op after success so that ->d_iput() will clean up
1777 if (status < 0) { 1777 * the dentry lock even if ocfs2_add_entry() fails below.
1778 */
1779 status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno);
1780 if (status) {
1778 mlog_errno(status); 1781 mlog_errno(status);
1779 goto bail; 1782 goto bail;
1780 } 1783 }
1784 dentry->d_op = &ocfs2_dentry_ops;
1781 1785
1782 status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno); 1786 status = ocfs2_add_entry(handle, dentry, inode,
1783 if (status) { 1787 le64_to_cpu(fe->i_blkno), parent_fe_bh,
1788 &lookup);
1789 if (status < 0) {
1784 mlog_errno(status); 1790 mlog_errno(status);
1785 goto bail; 1791 goto bail;
1786 } 1792 }
1787 1793
1788 insert_inode_hash(inode); 1794 insert_inode_hash(inode);
1789 dentry->d_op = &ocfs2_dentry_ops;
1790 d_instantiate(dentry, inode); 1795 d_instantiate(dentry, inode);
1791bail: 1796bail:
1792 if (status < 0 && did_quota) 1797 if (status < 0 && did_quota)
@@ -1798,6 +1803,8 @@ bail:
1798 ocfs2_commit_trans(osb, handle); 1803 ocfs2_commit_trans(osb, handle);
1799 1804
1800 ocfs2_inode_unlock(dir, 1); 1805 ocfs2_inode_unlock(dir, 1);
1806 if (did_block_signals)
1807 ocfs2_unblock_signals(&oldset);
1801 1808
1802 brelse(new_fe_bh); 1809 brelse(new_fe_bh);
1803 brelse(parent_fe_bh); 1810 brelse(parent_fe_bh);
@@ -1811,6 +1818,7 @@ bail:
1811 if (xattr_ac) 1818 if (xattr_ac)
1812 ocfs2_free_alloc_context(xattr_ac); 1819 ocfs2_free_alloc_context(xattr_ac);
1813 if ((status < 0) && inode) { 1820 if ((status < 0) && inode) {
1821 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
1814 clear_nlink(inode); 1822 clear_nlink(inode);
1815 iput(inode); 1823 iput(inode);
1816 } 1824 }
@@ -1911,7 +1919,7 @@ leave:
1911static int ocfs2_orphan_add(struct ocfs2_super *osb, 1919static int ocfs2_orphan_add(struct ocfs2_super *osb,
1912 handle_t *handle, 1920 handle_t *handle,
1913 struct inode *inode, 1921 struct inode *inode,
1914 struct ocfs2_dinode *fe, 1922 struct buffer_head *fe_bh,
1915 char *name, 1923 char *name,
1916 struct ocfs2_dir_lookup_result *lookup, 1924 struct ocfs2_dir_lookup_result *lookup,
1917 struct inode *orphan_dir_inode) 1925 struct inode *orphan_dir_inode)
@@ -1919,6 +1927,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1919 struct buffer_head *orphan_dir_bh = NULL; 1927 struct buffer_head *orphan_dir_bh = NULL;
1920 int status = 0; 1928 int status = 0;
1921 struct ocfs2_dinode *orphan_fe; 1929 struct ocfs2_dinode *orphan_fe;
1930 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1922 1931
1923 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 1932 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
1924 1933
@@ -1943,29 +1952,42 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1943 if (S_ISDIR(inode->i_mode)) 1952 if (S_ISDIR(inode->i_mode))
1944 ocfs2_add_links_count(orphan_fe, 1); 1953 ocfs2_add_links_count(orphan_fe, 1);
1945 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); 1954 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
1955 ocfs2_journal_dirty(handle, orphan_dir_bh);
1946 1956
1947 status = ocfs2_journal_dirty(handle, orphan_dir_bh); 1957 status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
1958 OCFS2_ORPHAN_NAMELEN, inode,
1959 OCFS2_I(inode)->ip_blkno,
1960 orphan_dir_bh, lookup);
1948 if (status < 0) { 1961 if (status < 0) {
1949 mlog_errno(status); 1962 mlog_errno(status);
1950 goto leave; 1963 goto leave;
1951 } 1964 }
1952 1965
1953 status = __ocfs2_add_entry(handle, orphan_dir_inode, name, 1966 /*
1954 OCFS2_ORPHAN_NAMELEN, inode, 1967 * We're going to journal the change of i_flags and i_orphaned_slot.
1955 OCFS2_I(inode)->ip_blkno, 1968 * It's safe anyway, though some callers may duplicate the journaling.
1956 orphan_dir_bh, lookup); 1969 * Journaling within the function just makes the logic
1970 * look more straightforward.
1971 */
1972 status = ocfs2_journal_access_di(handle,
1973 INODE_CACHE(inode),
1974 fe_bh,
1975 OCFS2_JOURNAL_ACCESS_WRITE);
1957 if (status < 0) { 1976 if (status < 0) {
1958 mlog_errno(status); 1977 mlog_errno(status);
1959 goto leave; 1978 goto leave;
1960 } 1979 }
1961 1980
1962 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL); 1981 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
1982 OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
1963 1983
1964 /* Record which orphan dir our inode now resides 1984 /* Record which orphan dir our inode now resides
1965 * in. delete_inode will use this to determine which orphan 1985 * in. delete_inode will use this to determine which orphan
1966 * dir to lock. */ 1986 * dir to lock. */
1967 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); 1987 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
1968 1988
1989 ocfs2_journal_dirty(handle, fe_bh);
1990
1969 mlog(0, "Inode %llu orphaned in slot %d\n", 1991 mlog(0, "Inode %llu orphaned in slot %d\n",
1970 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); 1992 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
1971 1993
@@ -2029,12 +2051,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2029 if (S_ISDIR(inode->i_mode)) 2051 if (S_ISDIR(inode->i_mode))
2030 ocfs2_add_links_count(orphan_fe, -1); 2052 ocfs2_add_links_count(orphan_fe, -1);
2031 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); 2053 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
2032 2054 ocfs2_journal_dirty(handle, orphan_dir_bh);
2033 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
2034 if (status < 0) {
2035 mlog_errno(status);
2036 goto leave;
2037 }
2038 2055
2039leave: 2056leave:
2040 ocfs2_free_dir_lookup_result(&lookup); 2057 ocfs2_free_dir_lookup_result(&lookup);
@@ -2123,7 +2140,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2123 } 2140 }
2124 2141
2125 di = (struct ocfs2_dinode *)new_di_bh->b_data; 2142 di = (struct ocfs2_dinode *)new_di_bh->b_data;
2126 status = ocfs2_orphan_add(osb, handle, inode, di, orphan_name, 2143 status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
2127 &orphan_insert, orphan_dir); 2144 &orphan_insert, orphan_dir);
2128 if (status < 0) { 2145 if (status < 0) {
2129 mlog_errno(status); 2146 mlog_errno(status);
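Three recurring error-path patterns in the namei.c hunks above: signals are blocked only once the transaction starts modifying state and restored through a did_block_signals flag on the shared exit path; the dentry lock and d_op are attached before ocfs2_add_entry() so ->d_iput() can clean up even on failure; and the final iput() is deferred until after the allocation contexts are freed, with OCFS2_INODE_SKIP_ORPHAN_DIR marking inodes that never made it to the orphan directory. A compilable sketch of the first pattern, with illustrative stand-ins for the ocfs2 phases:

#include <signal.h>

static void block_signals(sigset_t *oldset)
{
        sigset_t all;

        sigfillset(&all);
        sigprocmask(SIG_BLOCK, &all, oldset);
}

static void unblock_signals(sigset_t *oldset)
{
        sigprocmask(SIG_SETMASK, oldset, NULL);
}

static int do_reservations(void) { return 0; }  /* restartable phase */
static int do_mutations(void)    { return 0; }  /* destructive phase */

static int create_object(void)
{
        sigset_t oldset;
        int did_block_signals = 0;
        int status;

        status = do_reservations();
        if (status < 0)
                goto leave;

        /* Starting to change things; restart is no longer possible. */
        block_signals(&oldset);
        did_block_signals = 1;

        status = do_mutations();

leave:
        /* Shared exit path: only restore a mask we actually saved. */
        if (did_block_signals)
                unblock_signals(&oldset);
        return status;
}

int main(void)
{
        return create_object();
}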
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 1238b491db90..c67003b6b5a2 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -47,6 +47,7 @@
47/* For struct ocfs2_blockcheck_stats */ 47/* For struct ocfs2_blockcheck_stats */
48#include "blockcheck.h" 48#include "blockcheck.h"
49 49
50#include "reservations.h"
50 51
51/* Caching of metadata buffers */ 52/* Caching of metadata buffers */
52 53
@@ -341,6 +342,9 @@ struct ocfs2_super
341 */ 342 */
342 unsigned int local_alloc_bits; 343 unsigned int local_alloc_bits;
343 unsigned int local_alloc_default_bits; 344 unsigned int local_alloc_default_bits;
345 /* osb_clusters_at_boot can become stale! Do not trust it to
346 * be up to date. */
347 unsigned int osb_clusters_at_boot;
344 348
345 enum ocfs2_local_alloc_state local_alloc_state; /* protected 349 enum ocfs2_local_alloc_state local_alloc_state; /* protected
346 * by osb_lock */ 350 * by osb_lock */
@@ -349,6 +353,11 @@ struct ocfs2_super
349 353
350 u64 la_last_gd; 354 u64 la_last_gd;
351 355
356 struct ocfs2_reservation_map osb_la_resmap;
357
358 unsigned int osb_resv_level;
359 unsigned int osb_dir_resv_level;
360
352 /* Next three fields are for local node slot recovery during 361 /* Next three fields are for local node slot recovery during
353 * mount. */ 362 * mount. */
354 int dirty; 363 int dirty;
@@ -482,6 +491,13 @@ static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb)
482 return 0; 491 return 0;
483} 492}
484 493
494static inline int ocfs2_supports_discontig_bg(struct ocfs2_super *osb)
495{
496 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
497 return 1;
498 return 0;
499}
500
485static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb) 501static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb)
486{ 502{
487 if (ocfs2_supports_indexed_dirs(osb)) 503 if (ocfs2_supports_indexed_dirs(osb))
@@ -763,8 +779,24 @@ static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
763 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); 779 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
764} 780}
765 781
766#define ocfs2_set_bit ext2_set_bit 782static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb,
767#define ocfs2_clear_bit ext2_clear_bit 783 unsigned int clusters)
784{
785 return clusters >> (20 - OCFS2_SB(sb)->s_clustersize_bits);
786}
787
788static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
789{
790 ext2_set_bit(bit, bitmap);
791}
792#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
793
794static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
795{
796 ext2_clear_bit(bit, bitmap);
797}
798#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
799
768#define ocfs2_test_bit ext2_test_bit 800#define ocfs2_test_bit ext2_test_bit
769#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit 801#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
770#define ocfs2_find_next_bit ext2_find_next_bit 802#define ocfs2_find_next_bit ext2_find_next_bit
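Turning ocfs2_set_bit()/ocfs2_clear_bit() from bare macro aliases into inline functions behind a casting macro gives the compiler one fixed unsigned long * prototype, while the existing call sites — which pass assorted bitmap pointer types — keep compiling. A sketch of the same wrapper idiom; raw_set_bit() below is a plain-C stand-in for ext2_set_bit() (the real helper works on little-endian bitmaps), and the output noted in the comment assumes a little-endian host:

#include <limits.h>
#include <stdio.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

/* Plain-C stand-in for ext2_set_bit(): set bit 'nr' in a bitmap of
 * unsigned longs. Only the typing pattern mirrors the kernel. */
static inline void raw_set_bit(unsigned int nr, unsigned long *bitmap)
{
        bitmap[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

/* The inline function pins the pointer type; the macro keeps old
 * call sites compiling and makes the cast explicit in one place. */
#define my_set_bit(bit, addr) raw_set_bit((bit), (unsigned long *)(addr))

int main(void)
{
        unsigned long word = 0;
        unsigned char *bytes = (unsigned char *)&word;

        my_set_bit(10, bytes);              /* callers pass byte bitmaps */
        printf("byte 1 = %#x\n", bytes[1]); /* 0x4 on little-endian */
        return 0;
}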
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index bb37218a7978..33f1c9a8258d 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -100,7 +100,8 @@
100 | OCFS2_FEATURE_INCOMPAT_XATTR \ 100 | OCFS2_FEATURE_INCOMPAT_XATTR \
101 | OCFS2_FEATURE_INCOMPAT_META_ECC \ 101 | OCFS2_FEATURE_INCOMPAT_META_ECC \
102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ 102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE) 103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
104 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
104#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ 105#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
105 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ 106 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
106 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 107 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -165,6 +166,9 @@
165/* Refcount tree support */ 166/* Refcount tree support */
166#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000 167#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000
167 168
169/* Discontiguous block groups */
170#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000
171
168/* 172/*
169 * backup superblock flag is used to indicate that this volume 173 * backup superblock flag is used to indicate that this volume
170 * has backup superblocks. 174 * has backup superblocks.
@@ -283,14 +287,6 @@
283#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) 287#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
284 288
285/* 289/*
286 * Default local alloc size (in megabytes)
287 *
288 * The value chosen should be such that most allocations, including new
289 * block groups, use local alloc.
290 */
291#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8
292
293/*
294 * Inline extended attribute size (in bytes) 290 * Inline extended attribute size (in bytes)
295 * The value chosen should be aligned to 16 byte boundaries. 291 * The value chosen should be aligned to 16 byte boundaries.
296 */ 292 */
@@ -512,7 +508,10 @@ struct ocfs2_extent_block
512 block group */ 508 block group */
513 __le32 h_fs_generation; /* Must match super block */ 509 __le32 h_fs_generation; /* Must match super block */
514 __le64 h_blkno; /* Offset on disk, in blocks */ 510 __le64 h_blkno; /* Offset on disk, in blocks */
515/*20*/ __le64 h_reserved3; 511/*20*/ __le64 h_suballoc_loc; /* Suballocator block group this
512 eb belongs to. Only valid
513 if allocated from a
514 discontiguous block group */
516 __le64 h_next_leaf_blk; /* Offset on disk, in blocks, 515 __le64 h_next_leaf_blk; /* Offset on disk, in blocks,
517 of next leaf header pointing 516 of next leaf header pointing
518 to data */ 517 to data */
@@ -679,7 +678,11 @@ struct ocfs2_dinode {
679/*80*/ struct ocfs2_block_check i_check; /* Error checking */ 678/*80*/ struct ocfs2_block_check i_check; /* Error checking */
680/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */ 679/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */
681/*90*/ __le64 i_refcount_loc; 680/*90*/ __le64 i_refcount_loc;
682 __le64 i_reserved2[4]; 681 __le64 i_suballoc_loc; /* Suballocator block group this
682 inode belongs to. Only valid
683 if allocated from a
684 discontiguous block group */
685/*A0*/ __le64 i_reserved2[3];
683/*B8*/ union { 686/*B8*/ union {
684 __le64 i_pad1; /* Generic way to refer to this 687 __le64 i_pad1; /* Generic way to refer to this
685 64bit union */ 688 64bit union */
@@ -814,7 +817,12 @@ struct ocfs2_dx_root_block {
814 __le32 dr_reserved2; 817 __le32 dr_reserved2;
815 __le64 dr_free_blk; /* Pointer to head of free 818 __le64 dr_free_blk; /* Pointer to head of free
816 * unindexed block list. */ 819 * unindexed block list. */
817 __le64 dr_reserved3[15]; 820 __le64 dr_suballoc_loc; /* Suballocator block group
821 this root belongs to.
822 Only valid if allocated
823 from a discontiguous
824 block group */
825 __le64 dr_reserved3[14];
818 union { 826 union {
819 struct ocfs2_extent_list dr_list; /* Keep this aligned to 128 827 struct ocfs2_extent_list dr_list; /* Keep this aligned to 128
820 * bits for maximum space 828 * bits for maximum space
@@ -840,6 +848,13 @@ struct ocfs2_dx_leaf {
840}; 848};
841 849
842/* 850/*
851 * Largest bitmap for a block (suballocator) group in bytes. This limit
852 * does not affect cluster groups (global allocator). Cluster group
853 * bitmaps run to the end of the block.
854 */
855#define OCFS2_MAX_BG_BITMAP_SIZE 256
856
857/*
843 * On disk allocator group structure for OCFS2 858 * On disk allocator group structure for OCFS2
844 */ 859 */
845struct ocfs2_group_desc 860struct ocfs2_group_desc
@@ -860,7 +875,29 @@ struct ocfs2_group_desc
860 __le64 bg_blkno; /* Offset on disk, in blocks */ 875 __le64 bg_blkno; /* Offset on disk, in blocks */
861/*30*/ struct ocfs2_block_check bg_check; /* Error checking */ 876/*30*/ struct ocfs2_block_check bg_check; /* Error checking */
862 __le64 bg_reserved2; 877 __le64 bg_reserved2;
863/*40*/ __u8 bg_bitmap[0]; 878/*40*/ union {
879 __u8 bg_bitmap[0];
880 struct {
881 /*
882 * Block groups may be discontiguous when
883 * OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG is set.
884 * The extents of a discontiguous block group are
885 * stored in bg_list. It is a flat list.
886 * l_tree_depth must always be zero. A
887 * discontiguous group is signified by a non-zero
888 * bg_list->l_next_free_rec. Only block groups
889 * can be discontiguous; cluster groups cannot.
890 * We've never made a block group with more than
891 * 2048 blocks (256 bytes of bg_bitmap). This
892 * codifies that limit so that we can fit bg_list.
893 * bg_size of a discontiguous block group will
894 * be 256 to match bg_bitmap_filler.
895 */
896 __u8 bg_bitmap_filler[OCFS2_MAX_BG_BITMAP_SIZE];
897/*140*/ struct ocfs2_extent_list bg_list;
898 };
899 };
900/* Actual on-disk size is one block */
864}; 901};
865 902
866struct ocfs2_refcount_rec { 903struct ocfs2_refcount_rec {
@@ -905,7 +942,11 @@ struct ocfs2_refcount_block {
905/*40*/ __le32 rf_generation; /* generation number. all be the same 942/*40*/ __le32 rf_generation; /* generation number. all be the same
906 * for the same refcount tree. */ 943 * for the same refcount tree. */
907 __le32 rf_reserved0; 944 __le32 rf_reserved0;
908 __le64 rf_reserved1[7]; 945 __le64 rf_suballoc_loc; /* Suballocator block group this
946 refcount block belongs to. Only
947 valid if allocated from a
948 discontiguous block group */
949/*50*/ __le64 rf_reserved1[6];
909/*80*/ union { 950/*80*/ union {
910 struct ocfs2_refcount_list rf_records; /* List of refcount 951 struct ocfs2_refcount_list rf_records; /* List of refcount
911 records */ 952 records */
@@ -1017,7 +1058,10 @@ struct ocfs2_xattr_block {
1017 real xattr or a xattr tree. */ 1058 real xattr or a xattr tree. */
1018 __le16 xb_reserved0; 1059 __le16 xb_reserved0;
1019 __le32 xb_reserved1; 1060 __le32 xb_reserved1;
1020 __le64 xb_reserved2; 1061 __le64 xb_suballoc_loc; /* Suballocator block group this
1062 xattr block belongs to. Only
1063 valid if allocated from a
1064 discontiguous block group */
1021/*30*/ union { 1065/*30*/ union {
1022 struct ocfs2_xattr_header xb_header; /* xattr header if this 1066 struct ocfs2_xattr_header xb_header; /* xattr header if this
1023 block contains xattr */ 1067 block contains xattr */
@@ -1254,6 +1298,16 @@ static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
1254 return size / sizeof(struct ocfs2_extent_rec); 1298 return size / sizeof(struct ocfs2_extent_rec);
1255} 1299}
1256 1300
1301static inline u16 ocfs2_extent_recs_per_gd(struct super_block *sb)
1302{
1303 int size;
1304
1305 size = sb->s_blocksize -
1306 offsetof(struct ocfs2_group_desc, bg_list.l_recs);
1307
1308 return size / sizeof(struct ocfs2_extent_rec);
1309}
1310
1257static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb) 1311static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb)
1258{ 1312{
1259 int size; 1313 int size;
@@ -1284,13 +1338,23 @@ static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
1284 return size; 1338 return size;
1285} 1339}
1286 1340
1287static inline int ocfs2_group_bitmap_size(struct super_block *sb) 1341static inline int ocfs2_group_bitmap_size(struct super_block *sb,
1342 int suballocator,
1343 u32 feature_incompat)
1288{ 1344{
1289 int size; 1345 int size = sb->s_blocksize -
1290
1291 size = sb->s_blocksize -
1292 offsetof(struct ocfs2_group_desc, bg_bitmap); 1346 offsetof(struct ocfs2_group_desc, bg_bitmap);
1293 1347
1348 /*
1349 * The cluster allocator uses the entire block. Suballocators have
1350 * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older
1351 * code expects bg_size set to the maximum. Thus we must keep
1352 * bg_size as-is unless discontig_bg is enabled.
1353 */
1354 if (suballocator &&
1355 (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
1356 size = OCFS2_MAX_BG_BITMAP_SIZE;
1357
1294 return size; 1358 return size;
1295} 1359}
1296 1360
@@ -1402,23 +1466,43 @@ static inline int ocfs2_extent_recs_per_eb(int blocksize)
1402 return size / sizeof(struct ocfs2_extent_rec); 1466 return size / sizeof(struct ocfs2_extent_rec);
1403} 1467}
1404 1468
1405static inline int ocfs2_local_alloc_size(int blocksize) 1469static inline int ocfs2_extent_recs_per_gd(int blocksize)
1406{ 1470{
1407 int size; 1471 int size;
1408 1472
1409 size = blocksize - 1473 size = blocksize -
1410 offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); 1474 offsetof(struct ocfs2_group_desc, bg_list.l_recs);
1411 1475
1412 return size; 1476 return size / sizeof(struct ocfs2_extent_rec);
1413} 1477}
1414 1478
1415static inline int ocfs2_group_bitmap_size(int blocksize) 1479static inline int ocfs2_local_alloc_size(int blocksize)
1416{ 1480{
1417 int size; 1481 int size;
1418 1482
1419 size = blocksize - 1483 size = blocksize -
1484 offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
1485
1486 return size;
1487}
1488
1489static inline int ocfs2_group_bitmap_size(int blocksize,
1490 int suballocator,
1491 uint32_t feature_incompat)
1492{
1493 int size = blocksize -
1420 offsetof(struct ocfs2_group_desc, bg_bitmap); 1494 offsetof(struct ocfs2_group_desc, bg_bitmap);
1421 1495
1496 /*
1497 * The cluster allocator uses the entire block. Suballocators have
1498 * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older
1499 * code expects bg_size set to the maximum. Thus we must keep
1500 * bg_size as-is unless discontig_bg is enabled.
1501 */
1502 if (suballocator &&
1503 (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
1504 size = OCFS2_MAX_BG_BITMAP_SIZE;
1505
1422 return size; 1506 return size;
1423} 1507}
1424 1508
@@ -1491,5 +1575,19 @@ static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
1491 de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; 1575 de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
1492} 1576}
1493 1577
1578static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd)
1579{
1580 if ((offsetof(struct ocfs2_group_desc, bg_bitmap) +
1581 le16_to_cpu(gd->bg_size)) !=
1582 offsetof(struct ocfs2_group_desc, bg_list))
1583 return 0;
1584 /*
1585 * Only valid to check l_next_free_rec if
1586 * bg_bitmap + bg_size == bg_list.
1587 */
1588 if (!gd->bg_list.l_next_free_rec)
1589 return 0;
1590 return 1;
1591}
1494#endif /* _OCFS2_FS_H */ 1592#endif /* _OCFS2_FS_H */
1495 1593
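Two numbers anchor the discontiguous block group format above: OCFS2_MAX_BG_BITMAP_SIZE is 256 bytes, i.e. 2048 bits, matching the largest block group ever made (2048 blocks), and a discontiguous group always uses exactly that bitmap size so bg_list starts immediately behind it. ocfs2_gd_is_discontig() exploits this: the layout check (bg_bitmap offset + bg_size == bg_list offset) must hold before l_next_free_rec may be trusted. A sketch of that offsetof-based detection against a deliberately simplified descriptor (not the on-disk layout):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_BG_BITMAP_SIZE 256          /* 256 bytes = 2048 bits/blocks */

/* Deliberately simplified descriptor: just enough layout for the
 * check, not the real on-disk struct ocfs2_group_desc. */
struct group_desc {
        uint16_t bg_size;               /* bytes of bg_bitmap in use */
        uint8_t  bg_bitmap[MAX_BG_BITMAP_SIZE];
        struct {
                uint16_t l_next_free_rec;
        } bg_list;
};

static int gd_is_discontig(const struct group_desc *gd)
{
        /* A discontiguous group uses the full bitmap, so bg_bitmap +
         * bg_size must land exactly on bg_list... */
        if (offsetof(struct group_desc, bg_bitmap) + gd->bg_size !=
            offsetof(struct group_desc, bg_list))
                return 0;
        /* ...and only then may l_next_free_rec be trusted. */
        return gd->bg_list.l_next_free_rec != 0;
}

int main(void)
{
        struct group_desc gd = {
                .bg_size = MAX_BG_BITMAP_SIZE,
                .bg_list = { .l_next_free_rec = 1 },
        };

        printf("discontig: %d\n", gd_is_discontig(&gd));
        return 0;
}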
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 355f41d1d520..04ae76d8c6ab 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -3,6 +3,7 @@
3 */ 3 */
4#include <linux/spinlock.h> 4#include <linux/spinlock.h>
5#include <linux/fs.h> 5#include <linux/fs.h>
6#include <linux/slab.h>
6#include <linux/quota.h> 7#include <linux/quota.h>
7#include <linux/quotaops.h> 8#include <linux/quotaops.h>
8#include <linux/dqblk_qtree.h> 9#include <linux/dqblk_qtree.h>
@@ -260,10 +261,8 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
260 brelse(bh); 261 brelse(bh);
261 goto out; 262 goto out;
262 } 263 }
263 err = ocfs2_journal_dirty(handle, bh); 264 ocfs2_journal_dirty(handle, bh);
264 brelse(bh); 265 brelse(bh);
265 if (err < 0)
266 goto out;
267out: 266out:
268 if (err) { 267 if (err) {
269 mutex_unlock(&gqinode->i_mutex); 268 mutex_unlock(&gqinode->i_mutex);
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index a6467f3d262e..884b641f199e 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -3,6 +3,7 @@
3 */ 3 */
4 4
5#include <linux/fs.h> 5#include <linux/fs.h>
6#include <linux/slab.h>
6#include <linux/quota.h> 7#include <linux/quota.h>
7#include <linux/quotaops.h> 8#include <linux/quotaops.h>
8#include <linux/module.h> 9#include <linux/module.h>
@@ -118,12 +119,8 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
118 lock_buffer(bh); 119 lock_buffer(bh);
119 modify(bh, private); 120 modify(bh, private);
120 unlock_buffer(bh); 121 unlock_buffer(bh);
121 status = ocfs2_journal_dirty(handle, bh); 122 ocfs2_journal_dirty(handle, bh);
122 if (status < 0) { 123
123 mlog_errno(status);
124 ocfs2_commit_trans(OCFS2_SB(sb), handle);
125 return status;
126 }
127 status = ocfs2_commit_trans(OCFS2_SB(sb), handle); 124 status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
128 if (status < 0) { 125 if (status < 0) {
129 mlog_errno(status); 126 mlog_errno(status);
@@ -522,9 +519,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
522 ocfs2_clear_bit(bit, dchunk->dqc_bitmap); 519 ocfs2_clear_bit(bit, dchunk->dqc_bitmap);
523 le32_add_cpu(&dchunk->dqc_free, 1); 520 le32_add_cpu(&dchunk->dqc_free, 1);
524 unlock_buffer(qbh); 521 unlock_buffer(qbh);
525 status = ocfs2_journal_dirty(handle, qbh); 522 ocfs2_journal_dirty(handle, qbh);
526 if (status < 0)
527 mlog_errno(status);
528out_commit: 523out_commit:
529 mutex_unlock(&sb_dqopt(sb)->dqio_mutex); 524 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
530 ocfs2_commit_trans(OCFS2_SB(sb), handle); 525 ocfs2_commit_trans(OCFS2_SB(sb), handle);
@@ -630,9 +625,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
630 lock_buffer(bh); 625 lock_buffer(bh);
631 ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN); 626 ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN);
632 unlock_buffer(bh); 627 unlock_buffer(bh);
633 status = ocfs2_journal_dirty(handle, bh); 628 ocfs2_journal_dirty(handle, bh);
634 if (status < 0)
635 mlog_errno(status);
636out_trans: 629out_trans:
637 ocfs2_commit_trans(osb, handle); 630 ocfs2_commit_trans(osb, handle);
638out_bh: 631out_bh:
@@ -1008,11 +1001,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
1008 sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - 1001 sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
1009 OCFS2_QBLK_RESERVED_SPACE); 1002 OCFS2_QBLK_RESERVED_SPACE);
1010 unlock_buffer(bh); 1003 unlock_buffer(bh);
1011 status = ocfs2_journal_dirty(handle, bh); 1004 ocfs2_journal_dirty(handle, bh);
1012 if (status < 0) {
1013 mlog_errno(status);
1014 goto out_trans;
1015 }
1016 1005
1017 /* Initialize new block with structures */ 1006 /* Initialize new block with structures */
1018 down_read(&OCFS2_I(lqinode)->ip_alloc_sem); 1007 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
@@ -1039,11 +1028,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
1039 lock_buffer(dbh); 1028 lock_buffer(dbh);
1040 memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE); 1029 memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE);
1041 unlock_buffer(dbh); 1030 unlock_buffer(dbh);
1042 status = ocfs2_journal_dirty(handle, dbh); 1031 ocfs2_journal_dirty(handle, dbh);
1043 if (status < 0) {
1044 mlog_errno(status);
1045 goto out_trans;
1046 }
1047 1032
1048 /* Update local quotafile info */ 1033 /* Update local quotafile info */
1049 oinfo->dqi_blocks += 2; 1034 oinfo->dqi_blocks += 2;
@@ -1154,11 +1139,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1154 lock_buffer(bh); 1139 lock_buffer(bh);
1155 memset(bh->b_data, 0, sb->s_blocksize); 1140 memset(bh->b_data, 0, sb->s_blocksize);
1156 unlock_buffer(bh); 1141 unlock_buffer(bh);
1157 status = ocfs2_journal_dirty(handle, bh); 1142 ocfs2_journal_dirty(handle, bh);
1158 if (status < 0) { 1143
1159 mlog_errno(status);
1160 goto out_trans;
1161 }
1162 /* Update chunk header */ 1144 /* Update chunk header */
1163 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), 1145 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode),
1164 chunk->qc_headerbh, 1146 chunk->qc_headerbh,
@@ -1172,11 +1154,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1172 lock_buffer(chunk->qc_headerbh); 1154 lock_buffer(chunk->qc_headerbh);
1173 le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb)); 1155 le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb));
1174 unlock_buffer(chunk->qc_headerbh); 1156 unlock_buffer(chunk->qc_headerbh);
1175 status = ocfs2_journal_dirty(handle, chunk->qc_headerbh); 1157 ocfs2_journal_dirty(handle, chunk->qc_headerbh);
1176 if (status < 0) { 1158
1177 mlog_errno(status);
1178 goto out_trans;
1179 }
1180 /* Update file header */ 1159 /* Update file header */
1181 oinfo->dqi_blocks++; 1160 oinfo->dqi_blocks++;
1182 status = ocfs2_local_write_info(sb, type); 1161 status = ocfs2_local_write_info(sb, type);
@@ -1311,12 +1290,8 @@ static int ocfs2_local_release_dquot(struct dquot *dquot)
1311 ocfs2_clear_bit(offset, dchunk->dqc_bitmap); 1290 ocfs2_clear_bit(offset, dchunk->dqc_bitmap);
1312 le32_add_cpu(&dchunk->dqc_free, 1); 1291 le32_add_cpu(&dchunk->dqc_free, 1);
1313 unlock_buffer(od->dq_chunk->qc_headerbh); 1292 unlock_buffer(od->dq_chunk->qc_headerbh);
1314 status = ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); 1293 ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
1315 if (status < 0) { 1294
1316 mlog_errno(status);
1317 goto out;
1318 }
1319 status = 0;
1320out: 1295out:
1321 /* Clear the read bit so that next time someone uses this 1296 /* Clear the read bit so that next time someone uses this
1322 * dquot he reads fresh info from disk and allocates local 1297 * dquot he reads fresh info from disk and allocates local
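Every quota hunk above performs the same mechanical conversion: ocfs2_journal_dirty() evidently no longer returns an error, so the status checks and goto scaffolding around each call disappear. That only works if the helper treats a dirty failure as fatal internally — on a handle with properly reserved credits, failing to dirty a journaled buffer indicates a programming error rather than a recoverable condition. A sketch of the shape of such a conversion; all names are hypothetical, not the ocfs2 implementation:

#include <assert.h>

struct handle { int credits; };
struct buffer { int dirty; };

/* Before: returned int and every caller carried status/goto plumbing.
 * After: a failure can only mean the caller broke the journaling
 * rules (e.g. never reserved access to the buffer), so it is treated
 * as a bug, not a condition to propagate. Hypothetical names. */
static void journal_dirty(struct handle *h, struct buffer *bh)
{
        assert(h->credits > 0);         /* kernel code would BUG_ON() */
        h->credits--;
        bh->dirty = 1;
}

int main(void)
{
        struct handle h = { .credits = 1 };
        struct buffer bh = { .dirty = 0 };

        journal_dirty(&h, &bh);         /* no status check, no goto */
        return bh.dirty ? 0 : 1;
}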
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 9e96921dffda..4793f36f6518 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -37,7 +37,6 @@
37 37
38#include <linux/bio.h> 38#include <linux/bio.h>
39#include <linux/blkdev.h> 39#include <linux/blkdev.h>
40#include <linux/gfp.h>
41#include <linux/slab.h> 40#include <linux/slab.h>
42#include <linux/writeback.h> 41#include <linux/writeback.h>
43#include <linux/pagevec.h> 42#include <linux/pagevec.h>
@@ -571,7 +570,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
571 struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL; 570 struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
572 u16 suballoc_bit_start; 571 u16 suballoc_bit_start;
573 u32 num_got; 572 u32 num_got;
574 u64 first_blkno; 573 u64 suballoc_loc, first_blkno;
575 574
576 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); 575 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
577 576
@@ -597,7 +596,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
597 goto out_commit; 596 goto out_commit;
598 } 597 }
599 598
600 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, 599 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
601 &suballoc_bit_start, &num_got, 600 &suballoc_bit_start, &num_got,
602 &first_blkno); 601 &first_blkno);
603 if (ret) { 602 if (ret) {
@@ -627,6 +626,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
627 memset(rb, 0, inode->i_sb->s_blocksize); 626 memset(rb, 0, inode->i_sb->s_blocksize);
628 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 627 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
629 rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 628 rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
629 rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
630 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 630 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
631 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); 631 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
632 rb->rf_blkno = cpu_to_le64(first_blkno); 632 rb->rf_blkno = cpu_to_le64(first_blkno);
@@ -791,7 +791,10 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
791 if (le32_to_cpu(rb->rf_count) == 1) { 791 if (le32_to_cpu(rb->rf_count) == 1) {
792 blk = le64_to_cpu(rb->rf_blkno); 792 blk = le64_to_cpu(rb->rf_blkno);
793 bit = le16_to_cpu(rb->rf_suballoc_bit); 793 bit = le16_to_cpu(rb->rf_suballoc_bit);
794 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 794 if (rb->rf_suballoc_loc)
795 bg_blkno = le64_to_cpu(rb->rf_suballoc_loc);
796 else
797 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
795 798
796 alloc_inode = ocfs2_get_system_file_inode(osb, 799 alloc_inode = ocfs2_get_system_file_inode(osb,
797 EXTENT_ALLOC_SYSTEM_INODE, 800 EXTENT_ALLOC_SYSTEM_INODE,
@@ -1269,9 +1272,7 @@ static int ocfs2_change_refcount_rec(handle_t *handle,
1269 } else if (merge) 1272 } else if (merge)
1270 ocfs2_refcount_rec_merge(rb, index); 1273 ocfs2_refcount_rec_merge(rb, index);
1271 1274
1272 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1275 ocfs2_journal_dirty(handle, ref_leaf_bh);
1273 if (ret)
1274 mlog_errno(ret);
1275out: 1276out:
1276 return ret; 1277 return ret;
1277} 1278}
@@ -1285,7 +1286,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1285 int ret; 1286 int ret;
1286 u16 suballoc_bit_start; 1287 u16 suballoc_bit_start;
1287 u32 num_got; 1288 u32 num_got;
1288 u64 blkno; 1289 u64 suballoc_loc, blkno;
1289 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 1290 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1290 struct buffer_head *new_bh = NULL; 1291 struct buffer_head *new_bh = NULL;
1291 struct ocfs2_refcount_block *new_rb; 1292 struct ocfs2_refcount_block *new_rb;
@@ -1299,7 +1300,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1299 goto out; 1300 goto out;
1300 } 1301 }
1301 1302
1302 ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, 1303 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
1303 &suballoc_bit_start, &num_got, 1304 &suballoc_bit_start, &num_got,
1304 &blkno); 1305 &blkno);
1305 if (ret) { 1306 if (ret) {
@@ -1331,6 +1332,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1331 1332
1332 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; 1333 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1333 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 1334 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1335 new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
1334 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1336 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1335 new_rb->rf_blkno = cpu_to_le64(blkno); 1337 new_rb->rf_blkno = cpu_to_le64(blkno);
1336 new_rb->rf_cpos = cpu_to_le32(0); 1338 new_rb->rf_cpos = cpu_to_le32(0);
@@ -1525,7 +1527,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1525 int ret; 1527 int ret;
1526 u16 suballoc_bit_start; 1528 u16 suballoc_bit_start;
1527 u32 num_got, new_cpos; 1529 u32 num_got, new_cpos;
1528 u64 blkno; 1530 u64 suballoc_loc, blkno;
1529 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 1531 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1530 struct ocfs2_refcount_block *root_rb = 1532 struct ocfs2_refcount_block *root_rb =
1531 (struct ocfs2_refcount_block *)ref_root_bh->b_data; 1533 (struct ocfs2_refcount_block *)ref_root_bh->b_data;
@@ -1549,7 +1551,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1549 goto out; 1551 goto out;
1550 } 1552 }
1551 1553
1552 ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, 1554 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
1553 &suballoc_bit_start, &num_got, 1555 &suballoc_bit_start, &num_got,
1554 &blkno); 1556 &blkno);
1555 if (ret) { 1557 if (ret) {
@@ -1577,6 +1579,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1577 memset(new_rb, 0, sb->s_blocksize); 1579 memset(new_rb, 0, sb->s_blocksize);
1578 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 1580 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
1579 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 1581 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1582 new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
1580 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1583 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1581 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 1584 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
1582 new_rb->rf_blkno = cpu_to_le64(blkno); 1585 new_rb->rf_blkno = cpu_to_le64(blkno);
@@ -1695,7 +1698,7 @@ static int ocfs2_adjust_refcount_rec(handle_t *handle,
1695 * 2 more credits, one for the leaf refcount block, one for 1698 * 2 more credits, one for the leaf refcount block, one for
1696 * the extent block contains the extent rec. 1699 * the extent block contains the extent rec.
1697 */ 1700 */
1698 ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + 2); 1701 ret = ocfs2_extend_trans(handle, 2);
1699 if (ret < 0) { 1702 if (ret < 0) {
1700 mlog_errno(ret); 1703 mlog_errno(ret);
1701 goto out; 1704 goto out;
@@ -1803,11 +1806,7 @@ static int ocfs2_insert_refcount_rec(handle_t *handle,
1803 if (merge) 1806 if (merge)
1804 ocfs2_refcount_rec_merge(rb, index); 1807 ocfs2_refcount_rec_merge(rb, index);
1805 1808
1806 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1809 ocfs2_journal_dirty(handle, ref_leaf_bh);
1807 if (ret) {
1808 mlog_errno(ret);
1809 goto out;
1810 }
1811 1810
1812 if (index == 0) { 1811 if (index == 0) {
1813 ret = ocfs2_adjust_refcount_rec(handle, ci, 1812 ret = ocfs2_adjust_refcount_rec(handle, ci,
@@ -1978,9 +1977,7 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
1978 ocfs2_refcount_rec_merge(rb, index); 1977 ocfs2_refcount_rec_merge(rb, index);
1979 } 1978 }
1980 1979
1981 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1980 ocfs2_journal_dirty(handle, ref_leaf_bh);
1982 if (ret)
1983 mlog_errno(ret);
1984 1981
1985out: 1982out:
1986 brelse(new_bh); 1983 brelse(new_bh);
@@ -2113,6 +2110,7 @@ static int ocfs2_remove_refcount_extent(handle_t *handle,
2113 */ 2110 */
2114 ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE, 2111 ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
2115 le16_to_cpu(rb->rf_suballoc_slot), 2112 le16_to_cpu(rb->rf_suballoc_slot),
2113 le64_to_cpu(rb->rf_suballoc_loc),
2116 le64_to_cpu(rb->rf_blkno), 2114 le64_to_cpu(rb->rf_blkno),
2117 le16_to_cpu(rb->rf_suballoc_bit)); 2115 le16_to_cpu(rb->rf_suballoc_bit));
2118 if (ret) { 2116 if (ret) {
@@ -2517,20 +2515,19 @@ out:
2517 * 2515 *
2518 * Normally the refcount blocks storing these refcounts are 2516 * Normally the refcount blocks storing these refcounts are
2519 * also contiguous, so we can get their number easily. 2517 * also contiguous, so we can get their number easily.
2520 * As for meta_ac, we will at most add split 2 refcount record and 2518 * We will at most split 2 refcount records and add 2 more
2521 * 2 more refcount block, so just check it in a rough way. 2519 * refcount blocks, so just check it in a rough way.
2522 * 2520 *
2523 * Caller must hold refcount tree lock. 2521 * Caller must hold refcount tree lock.
2524 */ 2522 */
2525int ocfs2_prepare_refcount_change_for_del(struct inode *inode, 2523int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2526 struct buffer_head *di_bh, 2524 u64 refcount_loc,
2527 u64 phys_blkno, 2525 u64 phys_blkno,
2528 u32 clusters, 2526 u32 clusters,
2529 int *credits, 2527 int *credits,
2530 struct ocfs2_alloc_context **meta_ac) 2528 int *ref_blocks)
2531{ 2529{
2532 int ret, ref_blocks = 0; 2530 int ret;
2533 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2534 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2531 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2535 struct buffer_head *ref_root_bh = NULL; 2532 struct buffer_head *ref_root_bh = NULL;
2536 struct ocfs2_refcount_tree *tree; 2533 struct ocfs2_refcount_tree *tree;
@@ -2547,14 +2544,13 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2547 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); 2544 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
2548 2545
2549 ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), 2546 ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
2550 le64_to_cpu(di->i_refcount_loc), &tree); 2547 refcount_loc, &tree);
2551 if (ret) { 2548 if (ret) {
2552 mlog_errno(ret); 2549 mlog_errno(ret);
2553 goto out; 2550 goto out;
2554 } 2551 }
2555 2552
2556 ret = ocfs2_read_refcount_block(&tree->rf_ci, 2553 ret = ocfs2_read_refcount_block(&tree->rf_ci, refcount_loc,
2557 le64_to_cpu(di->i_refcount_loc),
2558 &ref_root_bh); 2554 &ref_root_bh);
2559 if (ret) { 2555 if (ret) {
2560 mlog_errno(ret); 2556 mlog_errno(ret);
@@ -2565,21 +2561,14 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2565 &tree->rf_ci, 2561 &tree->rf_ci,
2566 ref_root_bh, 2562 ref_root_bh,
2567 start_cpos, clusters, 2563 start_cpos, clusters,
2568 &ref_blocks, credits); 2564 ref_blocks, credits);
2569 if (ret) { 2565 if (ret) {
2570 mlog_errno(ret); 2566 mlog_errno(ret);
2571 goto out; 2567 goto out;
2572 } 2568 }
2573 2569
2574 mlog(0, "reserve new metadata %d, credits = %d\n", 2570 mlog(0, "reserve new metadata %d blocks, credits = %d\n",
2575 ref_blocks, *credits); 2571 *ref_blocks, *credits);
2576
2577 if (ref_blocks) {
2578 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
2579 ref_blocks, meta_ac);
2580 if (ret)
2581 mlog_errno(ret);
2582 }
2583 2572
2584out: 2573out:
2585 brelse(ref_root_bh); 2574 brelse(ref_root_bh);
@@ -3041,11 +3030,7 @@ static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
3041 } 3030 }
3042 3031
3043 memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize); 3032 memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize);
3044 ret = ocfs2_journal_dirty(handle, new_bh); 3033 ocfs2_journal_dirty(handle, new_bh);
3045 if (ret) {
3046 mlog_errno(ret);
3047 break;
3048 }
3049 3034
3050 brelse(new_bh); 3035 brelse(new_bh);
3051 brelse(old_bh); 3036 brelse(old_bh);
@@ -3283,7 +3268,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3283 } else { 3268 } else {
3284 delete = 1; 3269 delete = 1;
3285 3270
3286 ret = __ocfs2_claim_clusters(osb, handle, 3271 ret = __ocfs2_claim_clusters(handle,
3287 context->data_ac, 3272 context->data_ac,
3288 1, set_len, 3273 1, set_len,
3289 &new_bit, &new_len); 3274 &new_bit, &new_len);
@@ -4075,6 +4060,7 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
4075 OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features; 4060 OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
4076 spin_unlock(&OCFS2_I(t_inode)->ip_lock); 4061 spin_unlock(&OCFS2_I(t_inode)->ip_lock);
4077 i_size_write(t_inode, size); 4062 i_size_write(t_inode, size);
4063 t_inode->i_blocks = s_inode->i_blocks;
4078 4064
4079 di->i_xattr_inline_size = s_di->i_xattr_inline_size; 4065 di->i_xattr_inline_size = s_di->i_xattr_inline_size;
4080 di->i_clusters = s_di->i_clusters; 4066 di->i_clusters = s_di->i_clusters;
@@ -4083,6 +4069,9 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
4083 di->i_attr = s_di->i_attr; 4069 di->i_attr = s_di->i_attr;
4084 4070
4085 if (preserve) { 4071 if (preserve) {
4072 t_inode->i_uid = s_inode->i_uid;
4073 t_inode->i_gid = s_inode->i_gid;
4074 t_inode->i_mode = s_inode->i_mode;
4086 di->i_uid = s_di->i_uid; 4075 di->i_uid = s_di->i_uid;
4087 di->i_gid = s_di->i_gid; 4076 di->i_gid = s_di->i_gid;
4088 di->i_mode = s_di->i_mode; 4077 di->i_mode = s_di->i_mode;
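The rf_suballoc_loc read path above shows how the new on-disk fields stay backward compatible: metadata allocated before the discontig-bg feature has the field zeroed, so a zero value selects the old derivation of the group block from the item's own block number. A sketch of the zero-means-legacy dispatch; which_suballoc_group() mimics the contiguous-group arithmetic (descriptor at blkno - bit), but both names are stand-ins:

#include <stdint.h>
#include <stdio.h>

/* Contiguous groups keep the descriptor at blkno - bit; stand-in
 * for ocfs2_which_suballoc_group(). */
static uint64_t which_suballoc_group(uint64_t blkno, uint16_t bit)
{
        return blkno - bit;
}

/* Zero-means-legacy: the on-disk field reads as 0 for metadata
 * written before the feature existed, so 0 selects the old math. */
static uint64_t group_of(uint64_t suballoc_loc, uint64_t blkno,
                         uint16_t bit)
{
        if (suballoc_loc)       /* discontiguous group, recorded explicitly */
                return suballoc_loc;
        return which_suballoc_group(blkno, bit);
}

int main(void)
{
        printf("%llu\n", (unsigned long long)group_of(0, 5000, 8));    /* 4992 */
        printf("%llu\n", (unsigned long long)group_of(4096, 5000, 8)); /* 4096 */
        return 0;
}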
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index c1d19b1d3ecc..9983ba1570e2 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -47,11 +47,11 @@ int ocfs2_decrease_refcount(struct inode *inode,
47 struct ocfs2_cached_dealloc_ctxt *dealloc, 47 struct ocfs2_cached_dealloc_ctxt *dealloc,
48 int delete); 48 int delete);
49int ocfs2_prepare_refcount_change_for_del(struct inode *inode, 49int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
50 struct buffer_head *di_bh, 50 u64 refcount_loc,
51 u64 phys_blkno, 51 u64 phys_blkno,
52 u32 clusters, 52 u32 clusters,
53 int *credits, 53 int *credits,
54 struct ocfs2_alloc_context **meta_ac); 54 int *ref_blocks);
55int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh, 55int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
56 u32 cpos, u32 write_len, u32 max_cpos); 56 u32 cpos, u32 write_len, u32 max_cpos);
57 57
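The new ocfs2_prepare_refcount_change_for_del() signature moves work to the caller: instead of taking the inode's di_bh and reserving a metadata allocation context itself, it takes the refcount tree location directly and only reports ref_blocks and credits, so a caller can aggregate the requirements of several operations and reserve once. A compilable sketch of that compute-then-reserve split, with hypothetical names and a made-up worst case:

#include <stdio.h>

/* The "prepare" step only computes requirements... */
static int prepare_change(int clusters, int *credits, int *ref_blocks)
{
        /* Made-up worst case: at most 2 new refcount blocks. */
        *ref_blocks = clusters ? 2 : 0;
        *credits = 2 + *ref_blocks;
        return 0;
}

/* ...and the caller decides when and how much to reserve. */
static int reserve_metadata(int blocks)
{
        printf("reserving %d metadata blocks\n", blocks);
        return 0;
}

int main(void)
{
        int credits = 0, ref_blocks = 0;

        if (prepare_change(16, &credits, &ref_blocks))
                return 1;
        if (ref_blocks && reserve_metadata(ref_blocks))
                return 1;
        printf("credits = %d\n", credits);
        return 0;
}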
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
new file mode 100644
index 000000000000..40650021fc24
--- /dev/null
+++ b/fs/ocfs2/reservations.c
@@ -0,0 +1,847 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * reservations.c
5 *
6 * Allocation reservations implementation
7 *
8 * Some code borrowed from fs/ext3/balloc.c and is:
9 *
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 *
15 * The rest is copyright (C) 2010 Novell. All rights reserved.
16 *
17 * This program is free software; you can redistribute it and/or
18 * modify it under the terms of the GNU General Public
19 * License version 2 as published by the Free Software Foundation.
20 *
21 * This program is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24 * General Public License for more details.
25 */
26
27#include <linux/fs.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h>
31#include <linux/bitops.h>
32#include <linux/list.h>
33
34#define MLOG_MASK_PREFIX ML_RESERVATIONS
35#include <cluster/masklog.h>
36
37#include "ocfs2.h"
38
39#ifdef CONFIG_OCFS2_DEBUG_FS
40#define OCFS2_CHECK_RESERVATIONS
41#endif
42
43DEFINE_SPINLOCK(resv_lock);
44
45#define OCFS2_MIN_RESV_WINDOW_BITS 8
46#define OCFS2_MAX_RESV_WINDOW_BITS 1024
47
48int ocfs2_dir_resv_allowed(struct ocfs2_super *osb)
49{
50 return (osb->osb_resv_level && osb->osb_dir_resv_level);
51}
52
53static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap,
54 struct ocfs2_alloc_reservation *resv)
55{
56 struct ocfs2_super *osb = resmap->m_osb;
57 unsigned int bits;
58
59 if (!(resv->r_flags & OCFS2_RESV_FLAG_DIR)) {
60 /* 8, 16, 32, 64, 128, 256, 512, 1024 */
61 bits = 4 << osb->osb_resv_level;
62 } else {
63 bits = 4 << osb->osb_dir_resv_level;
64 }
65 return bits;
66}
67
68static inline unsigned int ocfs2_resv_end(struct ocfs2_alloc_reservation *resv)
69{
70 if (resv->r_len)
71 return resv->r_start + resv->r_len - 1;
72 return resv->r_start;
73}
74
75static inline int ocfs2_resv_empty(struct ocfs2_alloc_reservation *resv)
76{
77 return !!(resv->r_len == 0);
78}
79
80static inline int ocfs2_resmap_disabled(struct ocfs2_reservation_map *resmap)
81{
82 if (resmap->m_osb->osb_resv_level == 0)
83 return 1;
84 return 0;
85}
86
87static void ocfs2_dump_resv(struct ocfs2_reservation_map *resmap)
88{
89 struct ocfs2_super *osb = resmap->m_osb;
90 struct rb_node *node;
91 struct ocfs2_alloc_reservation *resv;
92 int i = 0;
93
94 mlog(ML_NOTICE, "Dumping resmap for device %s. Bitmap length: %u\n",
95 osb->dev_str, resmap->m_bitmap_len);
96
97 node = rb_first(&resmap->m_reservations);
98 while (node) {
99 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
100
101 mlog(ML_NOTICE, "start: %u\tend: %u\tlen: %u\tlast_start: %u"
102 "\tlast_len: %u\n", resv->r_start,
103 ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
104 resv->r_last_len);
105
106 node = rb_next(node);
107 i++;
108 }
109
110 mlog(ML_NOTICE, "%d reservations found. LRU follows\n", i);
111
112 i = 0;
113 list_for_each_entry(resv, &resmap->m_lru, r_lru) {
114 mlog(ML_NOTICE, "LRU(%d) start: %u\tend: %u\tlen: %u\t"
115 "last_start: %u\tlast_len: %u\n", i, resv->r_start,
116 ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
117 resv->r_last_len);
118
119 i++;
120 }
121}
122
123#ifdef OCFS2_CHECK_RESERVATIONS
124static int ocfs2_validate_resmap_bits(struct ocfs2_reservation_map *resmap,
125 int i,
126 struct ocfs2_alloc_reservation *resv)
127{
128 char *disk_bitmap = resmap->m_disk_bitmap;
129 unsigned int start = resv->r_start;
130 unsigned int end = ocfs2_resv_end(resv);
131
132 while (start <= end) {
133 if (ocfs2_test_bit(start, disk_bitmap)) {
134 mlog(ML_ERROR,
135 "reservation %d covers an allocated area "
136 "starting at bit %u!\n", i, start);
137 return 1;
138 }
139
140 start++;
141 }
142 return 0;
143}
144
145static void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
146{
147 unsigned int off = 0;
148 int i = 0;
149 struct rb_node *node;
150 struct ocfs2_alloc_reservation *resv;
151
152 node = rb_first(&resmap->m_reservations);
153 while (node) {
154 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
155
156 if (i > 0 && resv->r_start <= off) {
157 mlog(ML_ERROR, "reservation %d has bad start off!\n",
158 i);
159 goto bad;
160 }
161
162 if (resv->r_len == 0) {
163 mlog(ML_ERROR, "reservation %d has no length!\n",
164 i);
165 goto bad;
166 }
167
168 if (resv->r_start > ocfs2_resv_end(resv)) {
169 mlog(ML_ERROR, "reservation %d has invalid range!\n",
170 i);
171 goto bad;
172 }
173
174 if (ocfs2_resv_end(resv) >= resmap->m_bitmap_len) {
175 mlog(ML_ERROR, "reservation %d extends past bitmap!\n",
176 i);
177 goto bad;
178 }
179
180 if (ocfs2_validate_resmap_bits(resmap, i, resv))
181 goto bad;
182
183 off = ocfs2_resv_end(resv);
184 node = rb_next(node);
185
186 i++;
187 }
188 return;
189
190bad:
191 ocfs2_dump_resv(resmap);
192 BUG();
193}
194#else
195static inline void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
196{
197
198}
199#endif
200
201void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv)
202{
203 memset(resv, 0, sizeof(*resv));
204 INIT_LIST_HEAD(&resv->r_lru);
205}
206
207void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
208 unsigned int flags)
209{
210 BUG_ON(flags & ~OCFS2_RESV_TYPES);
211
212 resv->r_flags |= flags;
213}
214
215int ocfs2_resmap_init(struct ocfs2_super *osb,
216 struct ocfs2_reservation_map *resmap)
217{
218 memset(resmap, 0, sizeof(*resmap));
219
220 resmap->m_osb = osb;
221 resmap->m_reservations = RB_ROOT;
222 /* m_bitmap_len is initialized to zero by the above memset. */
223 INIT_LIST_HEAD(&resmap->m_lru);
224
225 return 0;
226}
227
228static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap,
229 struct ocfs2_alloc_reservation *resv)
230{
231 assert_spin_locked(&resv_lock);
232
233 if (!list_empty(&resv->r_lru))
234 list_del_init(&resv->r_lru);
235
236 list_add_tail(&resv->r_lru, &resmap->m_lru);
237}
238
239static void __ocfs2_resv_trunc(struct ocfs2_alloc_reservation *resv)
240{
241 resv->r_len = 0;
242 resv->r_start = 0;
243}
244
245static void ocfs2_resv_remove(struct ocfs2_reservation_map *resmap,
246 struct ocfs2_alloc_reservation *resv)
247{
248 if (resv->r_flags & OCFS2_RESV_FLAG_INUSE) {
249 list_del_init(&resv->r_lru);
250 rb_erase(&resv->r_node, &resmap->m_reservations);
251 resv->r_flags &= ~OCFS2_RESV_FLAG_INUSE;
252 }
253}
254
255static void __ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
256 struct ocfs2_alloc_reservation *resv)
257{
258 assert_spin_locked(&resv_lock);
259
260 __ocfs2_resv_trunc(resv);
261 /*
262 * last_len and last_start no longer make sense if
263 * we're changing the range of our allocations.
264 */
265 resv->r_last_len = resv->r_last_start = 0;
266
267 ocfs2_resv_remove(resmap, resv);
268}
269
270/* does nothing if 'resv' is null */
271void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
272 struct ocfs2_alloc_reservation *resv)
273{
274 if (resv) {
275 spin_lock(&resv_lock);
276 __ocfs2_resv_discard(resmap, resv);
277 spin_unlock(&resv_lock);
278 }
279}
280
281static void ocfs2_resmap_clear_all_resv(struct ocfs2_reservation_map *resmap)
282{
283 struct rb_node *node;
284 struct ocfs2_alloc_reservation *resv;
285
286 assert_spin_locked(&resv_lock);
287
288 while ((node = rb_last(&resmap->m_reservations)) != NULL) {
289 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
290
291 __ocfs2_resv_discard(resmap, resv);
292 }
293}
294
295void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
296 unsigned int clen, char *disk_bitmap)
297{
298 if (ocfs2_resmap_disabled(resmap))
299 return;
300
301 spin_lock(&resv_lock);
302
303 ocfs2_resmap_clear_all_resv(resmap);
304 resmap->m_bitmap_len = clen;
305 resmap->m_disk_bitmap = disk_bitmap;
306
307 spin_unlock(&resv_lock);
308}
309
310void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap)
311{
312 /* Does nothing for now. Keep this around for API symmetry */
313}
314
315static void ocfs2_resv_insert(struct ocfs2_reservation_map *resmap,
316 struct ocfs2_alloc_reservation *new)
317{
318 struct rb_root *root = &resmap->m_reservations;
319 struct rb_node *parent = NULL;
320 struct rb_node **p = &root->rb_node;
321 struct ocfs2_alloc_reservation *tmp;
322
323 assert_spin_locked(&resv_lock);
324
325 mlog(0, "Insert reservation start: %u len: %u\n", new->r_start,
326 new->r_len);
327
328 while (*p) {
329 parent = *p;
330
331 tmp = rb_entry(parent, struct ocfs2_alloc_reservation, r_node);
332
333 if (new->r_start < tmp->r_start) {
334 p = &(*p)->rb_left;
335
336 /*
337 * This is a good place to check for
338 * overlapping reservations.
339 */
340 BUG_ON(ocfs2_resv_end(new) >= tmp->r_start);
341 } else if (new->r_start > ocfs2_resv_end(tmp)) {
342 p = &(*p)->rb_right;
343 } else {
344 /* This should never happen! */
345 mlog(ML_ERROR, "Duplicate reservation window!\n");
346 BUG();
347 }
348 }
349
350 rb_link_node(&new->r_node, parent, p);
351 rb_insert_color(&new->r_node, root);
352 new->r_flags |= OCFS2_RESV_FLAG_INUSE;
353
354 ocfs2_resv_mark_lru(resmap, new);
355
356 ocfs2_check_resmap(resmap);
357}
358
359/**
360 * ocfs2_find_resv_lhs() - find the window which contains goal
361 * @resmap: reservation map to search
362 * @goal: which bit to search for
363 *
364 * If a window containing that goal is not found, we return the window
365 * which comes before goal. Returns NULL on empty rbtree or no window
366 * before goal.
367 */
368static struct ocfs2_alloc_reservation *
369ocfs2_find_resv_lhs(struct ocfs2_reservation_map *resmap, unsigned int goal)
370{
371 struct ocfs2_alloc_reservation *resv = NULL;
372 struct ocfs2_alloc_reservation *prev_resv = NULL;
373 struct rb_node *node = resmap->m_reservations.rb_node;
374
375 assert_spin_locked(&resv_lock);
376
377 if (!node)
378 return NULL;
379
380 node = rb_first(&resmap->m_reservations);
381 while (node) {
382 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
383
384 if (resv->r_start <= goal && ocfs2_resv_end(resv) >= goal)
385 break;
386
 387		/* Did we overshoot the reservation just before goal? */
388 if (resv->r_start > goal) {
389 resv = prev_resv;
390 break;
391 }
392
393 prev_resv = resv;
394 node = rb_next(node);
395 }
396
397 return resv;
398}
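/*
 * For illustration (hypothetical windows, not from this patch): with
 * reservations [10, 19] and [40, 49] in the tree, a goal of 15 returns
 * [10, 19] (goal falls inside it), a goal of 30 also returns [10, 19]
 * (the nearest window before goal), and a goal of 5 returns NULL
 * because no window starts at or before it.
 */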
399
400/*
401 * We are given a range within the bitmap, which corresponds to a gap
402 * inside the reservations tree (search_start, search_len). The range
403 * can be anything from the whole bitmap, to a gap between
404 * reservations.
405 *
406 * The start value of *rstart is insignificant.
407 *
408 * This function searches the bitmap range starting at search_start
409 * with length search_len for a set of contiguous free bits. We try
410 * to find up to 'wanted' bits, but can sometimes return less.
411 *
412 * Returns the length of allocation, 0 if no free bits are found.
413 *
414 * *cstart and *clen will also be populated with the result.
415 */
416static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap,
417 unsigned int wanted,
418 unsigned int search_start,
419 unsigned int search_len,
420 unsigned int *rstart,
421 unsigned int *rlen)
422{
423 void *bitmap = resmap->m_disk_bitmap;
424 unsigned int best_start, best_len = 0;
425 int offset, start, found;
426
427 mlog(0, "Find %u bits within range (%u, len %u) resmap len: %u\n",
428 wanted, search_start, search_len, resmap->m_bitmap_len);
429
430 found = best_start = best_len = 0;
431
432 start = search_start;
433 while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len,
434 start)) != -1) {
435 /* Search reached end of the region */
436 if (offset >= (search_start + search_len))
437 break;
438
439 if (offset == start) {
440 /* we found a zero */
441 found++;
442 /* move start to the next bit to test */
443 start++;
444 } else {
445 /* got a zero after some ones */
446 found = 1;
447 start = offset + 1;
448 }
449 if (found > best_len) {
450 best_len = found;
451 best_start = start - found;
452 }
453
454 if (found >= wanted)
455 break;
456 }
457
458 if (best_len == 0)
459 return 0;
460
461 if (best_len >= wanted)
462 best_len = wanted;
463
464 *rlen = best_len;
465 *rstart = best_start;
466
467 mlog(0, "Found start: %u len: %u\n", best_start, best_len);
468
469 return *rlen;
470}
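/*
 * Worked example (hypothetical bitmap, 0 = free): for bits 0,0,1,0,0,0
 * with search_start = 0, search_len = 6 and wanted = 3, the scan counts
 * the run at bits 0-1 (found = 2), restarts after the set bit at 2,
 * then counts bits 3-5 until found reaches wanted and the loop breaks,
 * giving *rstart = 3 and *rlen = 3.
 */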
471
472static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
473 struct ocfs2_alloc_reservation *resv,
474 unsigned int goal, unsigned int wanted)
475{
476 struct rb_root *root = &resmap->m_reservations;
477 unsigned int gap_start, gap_end, gap_len;
478 struct ocfs2_alloc_reservation *prev_resv, *next_resv;
479 struct rb_node *prev, *next;
480 unsigned int cstart, clen;
481 unsigned int best_start = 0, best_len = 0;
482
483 /*
484 * Nasty cases to consider:
485 *
486 * - rbtree is empty
487 * - our window should be first in all reservations
488 * - our window should be last in all reservations
489 * - need to make sure we don't go past end of bitmap
490 */
491
492 mlog(0, "resv start: %u resv end: %u goal: %u wanted: %u\n",
493 resv->r_start, ocfs2_resv_end(resv), goal, wanted);
494
495 assert_spin_locked(&resv_lock);
496
497 if (RB_EMPTY_ROOT(root)) {
498 /*
499 * Easiest case - empty tree. We can just take
500 * whatever window of free bits we want.
501 */
502
503 mlog(0, "Empty root\n");
504
505 clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
506 resmap->m_bitmap_len - goal,
507 &cstart, &clen);
508
509 /*
510 * This should never happen - the local alloc window
511 * will always have free bits when we're called.
512 */
513 BUG_ON(goal == 0 && clen == 0);
514
515 if (clen == 0)
516 return;
517
518 resv->r_start = cstart;
519 resv->r_len = clen;
520
521 ocfs2_resv_insert(resmap, resv);
522 return;
523 }
524
525 prev_resv = ocfs2_find_resv_lhs(resmap, goal);
526
527 if (prev_resv == NULL) {
528 mlog(0, "Goal on LHS of leftmost window\n");
529
530 /*
531 * A NULL here means that the search code couldn't
532 * find a window that starts before goal.
533 *
534 * However, we can take the first window after goal,
 535		 * which is also, by definition, the leftmost window in
536 * the entire tree. If we can find free bits in the
537 * gap between goal and the LHS window, then the
538 * reservation can safely be placed there.
539 *
540 * Otherwise we fall back to a linear search, checking
541 * the gaps in between windows for a place to
542 * allocate.
543 */
544
545 next = rb_first(root);
546 next_resv = rb_entry(next, struct ocfs2_alloc_reservation,
547 r_node);
548
549 /*
 550		 * The search should never return such a window (see the
 551		 * comment above).
552 */
553 if (next_resv->r_start <= goal) {
554 mlog(ML_ERROR, "goal: %u next_resv: start %u len %u\n",
555 goal, next_resv->r_start, next_resv->r_len);
556 ocfs2_dump_resv(resmap);
557 BUG();
558 }
559
560 clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
561 next_resv->r_start - goal,
562 &cstart, &clen);
563 if (clen) {
564 best_len = clen;
565 best_start = cstart;
566 if (best_len == wanted)
567 goto out_insert;
568 }
569
570 prev_resv = next_resv;
571 next_resv = NULL;
572 }
573
574 prev = &prev_resv->r_node;
575
 576	/* Now we do a linear search for a window, starting at 'prev_resv' */
577 while (1) {
578 next = rb_next(prev);
579 if (next) {
580 mlog(0, "One more resv found in linear search\n");
581 next_resv = rb_entry(next,
582 struct ocfs2_alloc_reservation,
583 r_node);
584
585 gap_start = ocfs2_resv_end(prev_resv) + 1;
586 gap_end = next_resv->r_start - 1;
587 gap_len = gap_end - gap_start + 1;
588 } else {
589 mlog(0, "No next node\n");
590 /*
591 * We're at the rightmost edge of the
592 * tree. See if a reservation between this
593 * window and the end of the bitmap will work.
594 */
595 gap_start = ocfs2_resv_end(prev_resv) + 1;
596 gap_len = resmap->m_bitmap_len - gap_start;
597 gap_end = resmap->m_bitmap_len - 1;
598 }
599
600 /*
601 * No need to check this gap if we have already found
602 * a larger region of free bits.
603 */
604 if (gap_len <= best_len)
605 goto next_resv;
606
607 clen = ocfs2_resmap_find_free_bits(resmap, wanted, gap_start,
608 gap_len, &cstart, &clen);
609 if (clen == wanted) {
610 best_len = clen;
611 best_start = cstart;
612 goto out_insert;
613 } else if (clen > best_len) {
614 best_len = clen;
615 best_start = cstart;
616 }
617
618next_resv:
619 if (!next)
620 break;
621
622 prev = next;
623 prev_resv = rb_entry(prev, struct ocfs2_alloc_reservation,
624 r_node);
625 }
626
627out_insert:
628 if (best_len) {
629 resv->r_start = best_start;
630 resv->r_len = best_len;
631 ocfs2_resv_insert(resmap, resv);
632 }
633}
634
635static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap,
636 struct ocfs2_alloc_reservation *resv,
637 unsigned int wanted)
638{
639 struct ocfs2_alloc_reservation *lru_resv;
640 int tmpwindow = !!(resv->r_flags & OCFS2_RESV_FLAG_TMP);
641 unsigned int min_bits;
642
643 if (!tmpwindow)
644 min_bits = ocfs2_resv_window_bits(resmap, resv) >> 1;
645 else
 646		min_bits = wanted; /* We know the temp window will use all
647 * of these bits */
648
649 /*
650 * Take the first reservation off the LRU as our 'target'. We
651 * don't try to be smart about it. There might be a case for
652 * searching based on size but I don't have enough data to be
653 * sure. --Mark (3/16/2010)
654 */
655 lru_resv = list_first_entry(&resmap->m_lru,
656 struct ocfs2_alloc_reservation, r_lru);
657
658 mlog(0, "lru resv: start: %u len: %u end: %u\n", lru_resv->r_start,
659 lru_resv->r_len, ocfs2_resv_end(lru_resv));
660
661 /*
662 * Cannibalize (some or all) of the target reservation and
663 * feed it to the current window.
664 */
665 if (lru_resv->r_len <= min_bits) {
666 /*
667 * Discard completely if size is less than or equal to a
 668		 * reasonable threshold - 50% of window bits for non-temporary
669 * windows.
670 */
671 resv->r_start = lru_resv->r_start;
672 resv->r_len = lru_resv->r_len;
673
674 __ocfs2_resv_discard(resmap, lru_resv);
675 } else {
676 unsigned int shrink;
677 if (tmpwindow)
678 shrink = min_bits;
679 else
680 shrink = lru_resv->r_len / 2;
681
682 lru_resv->r_len -= shrink;
683
684 resv->r_start = ocfs2_resv_end(lru_resv) + 1;
685 resv->r_len = shrink;
686 }
687
688 mlog(0, "Reservation now looks like: r_start: %u r_end: %u "
689 "r_len: %u r_last_start: %u r_last_len: %u\n",
690 resv->r_start, ocfs2_resv_end(resv), resv->r_len,
691 resv->r_last_start, resv->r_last_len);
692
693 ocfs2_resv_insert(resmap, resv);
694}
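/*
 * For illustration (hypothetical numbers, assuming the default resv
 * level of 2, i.e. a 16-bit window and min_bits = 8): an LRU victim
 * covering [100, 131] (32 bits) is larger than min_bits, so it is
 * halved - the victim keeps [100, 115] and the new reservation takes
 * [116, 131]. A victim of 8 bits or fewer would instead be discarded
 * and its range handed over wholesale.
 */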
695
696static void ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
697 struct ocfs2_alloc_reservation *resv,
698 unsigned int wanted)
699{
700 unsigned int goal = 0;
701
702 BUG_ON(!ocfs2_resv_empty(resv));
703
704 /*
705 * Begin by trying to get a window as close to the previous
706 * one as possible. Using the most recent allocation as a
707 * start goal makes sense.
708 */
709 if (resv->r_last_len) {
710 goal = resv->r_last_start + resv->r_last_len;
711 if (goal >= resmap->m_bitmap_len)
712 goal = 0;
713 }
714
715 __ocfs2_resv_find_window(resmap, resv, goal, wanted);
716
717 /* Search from last alloc didn't work, try once more from beginning. */
718 if (ocfs2_resv_empty(resv) && goal != 0)
719 __ocfs2_resv_find_window(resmap, resv, 0, wanted);
720
721 if (ocfs2_resv_empty(resv)) {
722 /*
723 * Still empty? Pull oldest one off the LRU, remove it from
 724		 * the tree, and put this one in its place.
725 */
726 ocfs2_cannibalize_resv(resmap, resv, wanted);
727 }
728
729 BUG_ON(ocfs2_resv_empty(resv));
730}
731
732int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
733 struct ocfs2_alloc_reservation *resv,
734 int *cstart, int *clen)
735{
736 unsigned int wanted = *clen;
737
738 if (resv == NULL || ocfs2_resmap_disabled(resmap))
739 return -ENOSPC;
740
741 spin_lock(&resv_lock);
742
743 /*
744 * We don't want to over-allocate for temporary
745 * windows. Otherwise, we run the risk of fragmenting the
746 * allocation space.
747 */
748 wanted = ocfs2_resv_window_bits(resmap, resv);
749 if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
750 wanted = *clen;
751
752 if (ocfs2_resv_empty(resv)) {
753 mlog(0, "empty reservation, find new window\n");
754
755 /*
756 * Try to get a window here. If it works, we must fall
 757		 * through and test the bitmap. This avoids some
 758		 * ping-ponging of windows due to non-reserved space
 759		 * being allocated before we initialize a window for
760 * that inode.
761 */
762 ocfs2_resv_find_window(resmap, resv, wanted);
763 }
764
765 BUG_ON(ocfs2_resv_empty(resv));
766
767 *cstart = resv->r_start;
768 *clen = resv->r_len;
769
770 spin_unlock(&resv_lock);
771 return 0;
772}
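/*
 * Window sizing, by example (hypothetical numbers): at the default resv
 * level of 2 the window is 16 bits, so a 4-cluster request from a
 * regular file is rounded up to a full 16-bit window, while a
 * 100-cluster request grows wanted to 100. A temporary
 * (OCFS2_RESV_FLAG_TMP) reservation never over-allocates: a 4-cluster
 * request stays at 4.
 */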
773
774static void
775 ocfs2_adjust_resv_from_alloc(struct ocfs2_reservation_map *resmap,
776 struct ocfs2_alloc_reservation *resv,
777 unsigned int start, unsigned int end)
778{
779 unsigned int rhs = 0;
780 unsigned int old_end = ocfs2_resv_end(resv);
781
782 BUG_ON(start != resv->r_start || old_end < end);
783
784 /*
785 * Completely used? We can remove it then.
786 */
787 if (old_end == end) {
788 __ocfs2_resv_discard(resmap, resv);
789 return;
790 }
791
792 rhs = old_end - end;
793
794 /*
795 * This should have been trapped above.
796 */
797 BUG_ON(rhs == 0);
798
799 resv->r_start = end + 1;
800 resv->r_len = old_end - resv->r_start + 1;
801}
802
803void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
804 struct ocfs2_alloc_reservation *resv,
805 u32 cstart, u32 clen)
806{
807 unsigned int cend = cstart + clen - 1;
808
809 if (resmap == NULL || ocfs2_resmap_disabled(resmap))
810 return;
811
812 if (resv == NULL)
813 return;
814
815 BUG_ON(cstart != resv->r_start);
816
817 spin_lock(&resv_lock);
818
819 mlog(0, "claim bits: cstart: %u cend: %u clen: %u r_start: %u "
820 "r_end: %u r_len: %u, r_last_start: %u r_last_len: %u\n",
821 cstart, cend, clen, resv->r_start, ocfs2_resv_end(resv),
822 resv->r_len, resv->r_last_start, resv->r_last_len);
823
824 BUG_ON(cstart < resv->r_start);
825 BUG_ON(cstart > ocfs2_resv_end(resv));
826 BUG_ON(cend > ocfs2_resv_end(resv));
827
828 ocfs2_adjust_resv_from_alloc(resmap, resv, cstart, cend);
829 resv->r_last_start = cstart;
830 resv->r_last_len = clen;
831
832 /*
 833	 * May have been discarded above in
834 * ocfs2_adjust_resv_from_alloc().
835 */
836 if (!ocfs2_resv_empty(resv))
837 ocfs2_resv_mark_lru(resmap, resv);
838
839 mlog(0, "Reservation now looks like: r_start: %u r_end: %u "
840 "r_len: %u r_last_start: %u r_last_len: %u\n",
841 resv->r_start, ocfs2_resv_end(resv), resv->r_len,
842 resv->r_last_start, resv->r_last_len);
843
844 ocfs2_check_resmap(resmap);
845
846 spin_unlock(&resv_lock);
847}
diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h
new file mode 100644
index 000000000000..1e49cc29d06c
--- /dev/null
+++ b/fs/ocfs2/reservations.h
@@ -0,0 +1,159 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * reservations.h
5 *
6 * Allocation reservations function prototypes and structures.
7 *
8 * Copyright (C) 2010 Novell. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License version 2 as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#ifndef OCFS2_RESERVATIONS_H
21#define OCFS2_RESERVATIONS_H
22
23#include <linux/rbtree.h>
24
25#define OCFS2_DEFAULT_RESV_LEVEL 2
26#define OCFS2_MAX_RESV_LEVEL 9
27#define OCFS2_MIN_RESV_LEVEL 0
28
29struct ocfs2_alloc_reservation {
30 struct rb_node r_node;
31
 32	unsigned int	r_start;	/* Beginning of current window */
33 unsigned int r_len; /* Length of the window */
34
35 unsigned int r_last_len; /* Length of most recent alloc */
36 unsigned int r_last_start; /* Start of most recent alloc */
37 struct list_head r_lru; /* LRU list head */
38
39 unsigned int r_flags;
40};
41
 42#define	OCFS2_RESV_FLAG_INUSE	0x01	/* Set when r_node is part of the rbtree */
43#define OCFS2_RESV_FLAG_TMP 0x02 /* Temporary reservation, will be
 44					 * destroyed immediately after use */
45#define OCFS2_RESV_FLAG_DIR 0x04 /* Reservation is for an unindexed
46 * directory btree */
47
48struct ocfs2_reservation_map {
49 struct rb_root m_reservations;
50 char *m_disk_bitmap;
51
52 struct ocfs2_super *m_osb;
53
54 /* The following are not initialized to meaningful values until a disk
55 * bitmap is provided. */
56 u32 m_bitmap_len; /* Number of valid
57 * bits available */
58
59 struct list_head m_lru; /* LRU of reservations
60 * structures. */
61
62};
63
64void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv);
65
66#define OCFS2_RESV_TYPES (OCFS2_RESV_FLAG_TMP|OCFS2_RESV_FLAG_DIR)
67void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
68 unsigned int flags);
69
70int ocfs2_dir_resv_allowed(struct ocfs2_super *osb);
71
72/**
73 * ocfs2_resv_discard() - truncate a reservation
 74 * @resmap: reservations bitmap containing @resv
75 * @resv: the reservation to truncate.
76 *
77 * After this function is called, the reservation will be empty, and
78 * unlinked from the rbtree.
79 */
80void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
81 struct ocfs2_alloc_reservation *resv);
82
83
84/**
85 * ocfs2_resmap_init() - Initialize fields of a reservations bitmap
 86 * @osb: ocfs2 super block owning this map
 87 * @resmap: struct ocfs2_reservation_map to initialize
90 *
 91 * The only possible return value other than '0' is -ENOMEM, for failure
 92 * to allocate the mirror bitmap.
93 */
94int ocfs2_resmap_init(struct ocfs2_super *osb,
95 struct ocfs2_reservation_map *resmap);
96
97/**
98 * ocfs2_resmap_restart() - "restart" a reservation bitmap
99 * @resmap: reservations bitmap
100 * @clen: Number of valid bits in the bitmap
101 * @disk_bitmap: the disk bitmap this resmap should refer to.
102 *
103 * Re-initialize the parameters of a reservation bitmap. This is
104 * useful for local alloc window slides.
105 *
 106 * This function discards (truncates and unlinks) all existing
 107 * reservations. A future version will recalculate existing
108 * reservations based on the new bitmap.
109 */
110void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
111 unsigned int clen, char *disk_bitmap);
112
113/**
114 * ocfs2_resmap_uninit() - uninitialize a reservation bitmap structure
115 * @resmap: the struct ocfs2_reservation_map to uninitialize
116 */
117void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap);
118
119/**
120 * ocfs2_resmap_resv_bits() - Return still-valid reservation bits
121 * @resmap: reservations bitmap
122 * @resv: reservation to base search from
123 * @cstart: start of proposed allocation
124 * @clen: length (in clusters) of proposed allocation
125 *
126 * Using the reservation data from resv, this function will compare
127 * resmap and resmap->m_disk_bitmap to determine what part (if any) of
128 * the reservation window is still clear to use. If resv is empty,
129 * this function will try to allocate a window for it.
130 *
131 * On success, zero is returned and the valid allocation area is set in cstart
132 * and clen.
133 *
134 * Returns -ENOSPC if reservations are disabled.
135 */
136int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
137 struct ocfs2_alloc_reservation *resv,
138 int *cstart, int *clen);
139
140/**
141 * ocfs2_resmap_claimed_bits() - Tell the reservation code that bits were used.
142 * @resmap: reservations bitmap
 143 * @resv: optional reservation to recalculate based on new bitmap
144 * @cstart: start of allocation in clusters
 145 * @clen: length of allocation in clusters.
146 *
147 * Tell the reservation code that bits were used to fulfill allocation in
148 * resmap. The bits don't have to have been part of any existing
149 * reservation. But we must always call this function when bits are claimed.
150 * Internally, the reservations code will use this information to mark the
 151 * reservations bitmap. If resv is passed, its next allocation window will be
 152 * calculated. It also expects that 'cstart' is the same value we passed back
153 * from ocfs2_resmap_resv_bits().
154 */
155void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
156 struct ocfs2_alloc_reservation *resv,
157 u32 cstart, u32 clen);
158
159#endif /* OCFS2_RESERVATIONS_H */
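Taken together, ocfs2_resmap_resv_bits() and ocfs2_resmap_claimed_bits() form a reserve-then-claim protocol. The sketch below is illustrative only: example_alloc() is a hypothetical caller, and the bitmap search it elides is done by the real allocators in suballoc.c; locking is handled inside the helpers, as reservations.c above shows.

static int example_alloc(struct ocfs2_reservation_map *resmap,
			 struct ocfs2_alloc_reservation *resv,
			 int wanted)
{
	int cstart, clen = wanted;

	/* Ask the reservation code which window to search first. */
	if (ocfs2_resmap_resv_bits(resmap, resv, &cstart, &clen))
		return -ENOSPC;	/* reservations disabled */

	/*
	 * A real caller searches the disk bitmap within
	 * [cstart, cstart + clen) here; we pretend it claimed all
	 * clen bits at cstart and report that back.
	 */
	ocfs2_resmap_claimed_bits(resmap, resv, cstart, clen);
	return 0;
}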
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 3c3d673a4d20..dacd553d8617 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -134,11 +134,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
134 le16_add_cpu(&group->bg_free_bits_count, -1 * backups); 134 le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
135 } 135 }
136 136
137 ret = ocfs2_journal_dirty(handle, group_bh); 137 ocfs2_journal_dirty(handle, group_bh);
138 if (ret < 0) {
139 mlog_errno(ret);
140 goto out_rollback;
141 }
142 138
143 /* update the inode accordingly. */ 139 /* update the inode accordingly. */
144 ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh, 140 ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh,
@@ -319,7 +315,8 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
319 BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); 315 BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
320 316
321 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != 317 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
322 ocfs2_group_bitmap_size(osb->sb) * 8) { 318 ocfs2_group_bitmap_size(osb->sb, 0,
319 osb->s_feature_incompat) * 8) {
323 mlog(ML_ERROR, "The disk is too old and small. " 320 mlog(ML_ERROR, "The disk is too old and small. "
324 "Force to do offline resize."); 321 "Force to do offline resize.");
325 ret = -EINVAL; 322 ret = -EINVAL;
@@ -500,7 +497,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
500 fe = (struct ocfs2_dinode *)main_bm_bh->b_data; 497 fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
501 498
502 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != 499 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
503 ocfs2_group_bitmap_size(osb->sb) * 8) { 500 ocfs2_group_bitmap_size(osb->sb, 0,
501 osb->s_feature_incompat) * 8) {
504 mlog(ML_ERROR, "The disk is too old and small." 502 mlog(ML_ERROR, "The disk is too old and small."
505 " Force to do offline resize."); 503 " Force to do offline resize.");
506 ret = -EINVAL; 504 ret = -EINVAL;
@@ -545,12 +543,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
545 543
546 group = (struct ocfs2_group_desc *)group_bh->b_data; 544 group = (struct ocfs2_group_desc *)group_bh->b_data;
547 group->bg_next_group = cr->c_blkno; 545 group->bg_next_group = cr->c_blkno;
548 546 ocfs2_journal_dirty(handle, group_bh);
549 ret = ocfs2_journal_dirty(handle, group_bh);
550 if (ret < 0) {
551 mlog_errno(ret);
552 goto out_commit;
553 }
554 547
555 ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode), 548 ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode),
556 main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE); 549 main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE);
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 7020e1253ffa..0d3049f696c5 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -19,6 +19,7 @@
19 19
20#include <linux/kernel.h> 20#include <linux/kernel.h>
21#include <linux/crc32.h> 21#include <linux/crc32.h>
22#include <linux/slab.h>
22#include <linux/module.h> 23#include <linux/module.h>
23 24
24/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */ 25/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 5ae8812b2864..2dc57bca0688 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -21,6 +21,7 @@
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/miscdevice.h> 22#include <linux/miscdevice.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h>
24#include <linux/smp_lock.h> 25#include <linux/smp_lock.h>
25#include <linux/reboot.h> 26#include <linux/reboot.h>
26#include <asm/uaccess.h> 27#include <asm/uaccess.h>
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c3c60bc3e072..f4c2a9eb8c4d 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -53,6 +53,15 @@
53 53
54#define OCFS2_MAX_TO_STEAL 1024 54#define OCFS2_MAX_TO_STEAL 1024
55 55
56struct ocfs2_suballoc_result {
57 u64 sr_bg_blkno; /* The bg we allocated from. Set
58 to 0 when a block group is
59 contiguous. */
60 u64 sr_blkno; /* The first allocated block */
61 unsigned int sr_bit_offset; /* The bit in the bg */
62 unsigned int sr_bits; /* How many bits we claimed */
63};
64
56static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); 65static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
57static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); 66static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
58static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); 67static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
@@ -60,6 +69,7 @@ static int ocfs2_block_group_fill(handle_t *handle,
60 struct inode *alloc_inode, 69 struct inode *alloc_inode,
61 struct buffer_head *bg_bh, 70 struct buffer_head *bg_bh,
62 u64 group_blkno, 71 u64 group_blkno,
72 unsigned int group_clusters,
63 u16 my_chain, 73 u16 my_chain,
64 struct ocfs2_chain_list *cl); 74 struct ocfs2_chain_list *cl);
65static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 75static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
@@ -73,20 +83,17 @@ static int ocfs2_cluster_group_search(struct inode *inode,
73 struct buffer_head *group_bh, 83 struct buffer_head *group_bh,
74 u32 bits_wanted, u32 min_bits, 84 u32 bits_wanted, u32 min_bits,
75 u64 max_block, 85 u64 max_block,
76 u16 *bit_off, u16 *bits_found); 86 struct ocfs2_suballoc_result *res);
77static int ocfs2_block_group_search(struct inode *inode, 87static int ocfs2_block_group_search(struct inode *inode,
78 struct buffer_head *group_bh, 88 struct buffer_head *group_bh,
79 u32 bits_wanted, u32 min_bits, 89 u32 bits_wanted, u32 min_bits,
80 u64 max_block, 90 u64 max_block,
81 u16 *bit_off, u16 *bits_found); 91 struct ocfs2_suballoc_result *res);
82static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, 92static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
83 struct ocfs2_alloc_context *ac,
84 handle_t *handle, 93 handle_t *handle,
85 u32 bits_wanted, 94 u32 bits_wanted,
86 u32 min_bits, 95 u32 min_bits,
87 u16 *bit_off, 96 struct ocfs2_suballoc_result *res);
88 unsigned int *num_bits,
89 u64 *bg_blkno);
90static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, 97static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
91 int nr); 98 int nr);
92static inline int ocfs2_block_group_set_bits(handle_t *handle, 99static inline int ocfs2_block_group_set_bits(handle_t *handle,
@@ -95,13 +102,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
95 struct buffer_head *group_bh, 102 struct buffer_head *group_bh,
96 unsigned int bit_off, 103 unsigned int bit_off,
97 unsigned int num_bits); 104 unsigned int num_bits);
98static inline int ocfs2_block_group_clear_bits(handle_t *handle,
99 struct inode *alloc_inode,
100 struct ocfs2_group_desc *bg,
101 struct buffer_head *group_bh,
102 unsigned int bit_off,
103 unsigned int num_bits);
104
105static int ocfs2_relink_block_group(handle_t *handle, 105static int ocfs2_relink_block_group(handle_t *handle,
106 struct inode *alloc_inode, 106 struct inode *alloc_inode,
107 struct buffer_head *fe_bh, 107 struct buffer_head *fe_bh,
@@ -137,6 +137,7 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
137 } 137 }
138 brelse(ac->ac_bh); 138 brelse(ac->ac_bh);
139 ac->ac_bh = NULL; 139 ac->ac_bh = NULL;
140 ac->ac_resv = NULL;
140} 141}
141 142
142void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) 143void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -152,7 +153,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
152 153
153#define do_error(fmt, ...) \ 154#define do_error(fmt, ...) \
154 do{ \ 155 do{ \
155 if (clean_error) \ 156 if (resize) \
156 mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ 157 mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \
157 else \ 158 else \
158 ocfs2_error(sb, fmt, ##__VA_ARGS__); \ 159 ocfs2_error(sb, fmt, ##__VA_ARGS__); \
@@ -160,7 +161,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
160 161
161static int ocfs2_validate_gd_self(struct super_block *sb, 162static int ocfs2_validate_gd_self(struct super_block *sb,
162 struct buffer_head *bh, 163 struct buffer_head *bh,
163 int clean_error) 164 int resize)
164{ 165{
165 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 166 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
166 167
@@ -211,7 +212,7 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
211static int ocfs2_validate_gd_parent(struct super_block *sb, 212static int ocfs2_validate_gd_parent(struct super_block *sb,
212 struct ocfs2_dinode *di, 213 struct ocfs2_dinode *di,
213 struct buffer_head *bh, 214 struct buffer_head *bh,
214 int clean_error) 215 int resize)
215{ 216{
216 unsigned int max_bits; 217 unsigned int max_bits;
217 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 218 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
@@ -233,8 +234,11 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
233 return -EINVAL; 234 return -EINVAL;
234 } 235 }
235 236
236 if (le16_to_cpu(gd->bg_chain) >= 237 /* In resize, we may meet the case bg_chain == cl_next_free_rec. */
237 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) { 238 if ((le16_to_cpu(gd->bg_chain) >
239 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
240 ((le16_to_cpu(gd->bg_chain) ==
241 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
238 do_error("Group descriptor #%llu has bad chain %u", 242 do_error("Group descriptor #%llu has bad chain %u",
239 (unsigned long long)bh->b_blocknr, 243 (unsigned long long)bh->b_blocknr,
240 le16_to_cpu(gd->bg_chain)); 244 le16_to_cpu(gd->bg_chain));
@@ -329,14 +333,38 @@ out:
329 return rc; 333 return rc;
330} 334}
331 335
336static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
337 struct ocfs2_group_desc *bg,
338 struct ocfs2_chain_list *cl,
339 u64 p_blkno, u32 clusters)
340{
341 struct ocfs2_extent_list *el = &bg->bg_list;
342 struct ocfs2_extent_rec *rec;
343
344 BUG_ON(!ocfs2_supports_discontig_bg(osb));
345 if (!el->l_next_free_rec)
346 el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
347 rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
348 rec->e_blkno = cpu_to_le64(p_blkno);
349 rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
350 le16_to_cpu(cl->cl_bpc));
351 rec->e_leaf_clusters = cpu_to_le32(clusters);
352 le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
353 le16_add_cpu(&bg->bg_free_bits_count,
354 clusters * le16_to_cpu(cl->cl_bpc));
355 le16_add_cpu(&el->l_next_free_rec, 1);
356}
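/*
 * For illustration (hypothetical geometry): with cl_bpc = 2 bits per
 * cluster, adding a 4-cluster extent at p_blkno appends a record whose
 * e_cpos is the group's current bit count divided by cl_bpc, then
 * grows both bg_bits and bg_free_bits_count by 4 * 2 = 8 bits.
 */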
357
332static int ocfs2_block_group_fill(handle_t *handle, 358static int ocfs2_block_group_fill(handle_t *handle,
333 struct inode *alloc_inode, 359 struct inode *alloc_inode,
334 struct buffer_head *bg_bh, 360 struct buffer_head *bg_bh,
335 u64 group_blkno, 361 u64 group_blkno,
362 unsigned int group_clusters,
336 u16 my_chain, 363 u16 my_chain,
337 struct ocfs2_chain_list *cl) 364 struct ocfs2_chain_list *cl)
338{ 365{
339 int status = 0; 366 int status = 0;
367 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
340 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 368 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
341 struct super_block * sb = alloc_inode->i_sb; 369 struct super_block * sb = alloc_inode->i_sb;
342 370
@@ -363,19 +391,23 @@ static int ocfs2_block_group_fill(handle_t *handle,
363 memset(bg, 0, sb->s_blocksize); 391 memset(bg, 0, sb->s_blocksize);
364 strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE); 392 strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
365 bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 393 bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
366 bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb)); 394 bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
367 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl)); 395 osb->s_feature_incompat));
368 bg->bg_chain = cpu_to_le16(my_chain); 396 bg->bg_chain = cpu_to_le16(my_chain);
369 bg->bg_next_group = cl->cl_recs[my_chain].c_blkno; 397 bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
370 bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno); 398 bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
371 bg->bg_blkno = cpu_to_le64(group_blkno); 399 bg->bg_blkno = cpu_to_le64(group_blkno);
400 if (group_clusters == le16_to_cpu(cl->cl_cpg))
401 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
402 else
403 ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
404 group_clusters);
405
372 /* set the 1st bit in the bitmap to account for the descriptor block */ 406 /* set the 1st bit in the bitmap to account for the descriptor block */
373 ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap); 407 ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
374 bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1); 408 bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
375 409
376 status = ocfs2_journal_dirty(handle, bg_bh); 410 ocfs2_journal_dirty(handle, bg_bh);
377 if (status < 0)
378 mlog_errno(status);
379 411
380 /* There is no need to zero out or otherwise initialize the 412 /* There is no need to zero out or otherwise initialize the
381 * other blocks in a group - All valid FS metadata in a block 413 * other blocks in a group - All valid FS metadata in a block
@@ -401,6 +433,238 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
401 return best; 433 return best;
402} 434}
403 435
436static struct buffer_head *
437ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
438 struct inode *alloc_inode,
439 struct ocfs2_alloc_context *ac,
440 struct ocfs2_chain_list *cl)
441{
442 int status;
443 u32 bit_off, num_bits;
444 u64 bg_blkno;
445 struct buffer_head *bg_bh;
446 unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
447
448 status = ocfs2_claim_clusters(handle, ac,
449 le16_to_cpu(cl->cl_cpg), &bit_off,
450 &num_bits);
451 if (status < 0) {
452 if (status != -ENOSPC)
453 mlog_errno(status);
454 goto bail;
455 }
456
457 /* setup the group */
458 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
459 mlog(0, "new descriptor, record %u, at block %llu\n",
460 alloc_rec, (unsigned long long)bg_blkno);
461
462 bg_bh = sb_getblk(osb->sb, bg_blkno);
463 if (!bg_bh) {
464 status = -EIO;
465 mlog_errno(status);
466 goto bail;
467 }
468 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
469
470 status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
471 bg_blkno, num_bits, alloc_rec, cl);
472 if (status < 0) {
473 brelse(bg_bh);
474 mlog_errno(status);
475 }
476
477bail:
478 return status ? ERR_PTR(status) : bg_bh;
479}
480
481static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
482 handle_t *handle,
483 struct ocfs2_alloc_context *ac,
484 unsigned int min_bits,
485 u32 *bit_off, u32 *num_bits)
486{
487 int status = 0;
488
489 while (min_bits) {
490 status = ocfs2_claim_clusters(handle, ac, min_bits,
491 bit_off, num_bits);
492 if (status != -ENOSPC)
493 break;
494
495 min_bits >>= 1;
496 }
497
498 return status;
499}
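/*
 * For illustration: with min_bits = 16 on a fragmented bitmap, the
 * loop above asks for 16, then 8, 4, 2 and finally 1 cluster(s),
 * returning the first successful claim; only if every size fails with
 * -ENOSPC does that error reach the caller.
 */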
500
501static int ocfs2_block_group_grow_discontig(handle_t *handle,
502 struct inode *alloc_inode,
503 struct buffer_head *bg_bh,
504 struct ocfs2_alloc_context *ac,
505 struct ocfs2_chain_list *cl,
506 unsigned int min_bits)
507{
508 int status;
509 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
510 struct ocfs2_group_desc *bg =
511 (struct ocfs2_group_desc *)bg_bh->b_data;
512 unsigned int needed = le16_to_cpu(cl->cl_cpg) -
513 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
514 u32 p_cpos, clusters;
515 u64 p_blkno;
516 struct ocfs2_extent_list *el = &bg->bg_list;
517
518 status = ocfs2_journal_access_gd(handle,
519 INODE_CACHE(alloc_inode),
520 bg_bh,
521 OCFS2_JOURNAL_ACCESS_CREATE);
522 if (status < 0) {
523 mlog_errno(status);
524 goto bail;
525 }
526
527 while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
528 le16_to_cpu(el->l_count))) {
529 if (min_bits > needed)
530 min_bits = needed;
531 status = ocfs2_block_group_claim_bits(osb, handle, ac,
532 min_bits, &p_cpos,
533 &clusters);
534 if (status < 0) {
535 if (status != -ENOSPC)
536 mlog_errno(status);
537 goto bail;
538 }
539 p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
540 ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
541 clusters);
542
543 min_bits = clusters;
544 needed = le16_to_cpu(cl->cl_cpg) -
545 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
546 }
547
548 if (needed > 0) {
549 /*
 550		 * We have used up all the extent recs but still can't fill
 551		 * up the cluster group (cl_cpg), so bail out.
552 */
553 status = -ENOSPC;
554 goto bail;
555 }
556
557 ocfs2_journal_dirty(handle, bg_bh);
558
559bail:
560 return status;
561}
562
563static void ocfs2_bg_alloc_cleanup(handle_t *handle,
564 struct ocfs2_alloc_context *cluster_ac,
565 struct inode *alloc_inode,
566 struct buffer_head *bg_bh)
567{
568 int i, ret;
569 struct ocfs2_group_desc *bg;
570 struct ocfs2_extent_list *el;
571 struct ocfs2_extent_rec *rec;
572
573 if (!bg_bh)
574 return;
575
576 bg = (struct ocfs2_group_desc *)bg_bh->b_data;
577 el = &bg->bg_list;
578 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
579 rec = &el->l_recs[i];
580 ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
581 cluster_ac->ac_bh,
582 le64_to_cpu(rec->e_blkno),
583 le32_to_cpu(rec->e_leaf_clusters));
584 if (ret)
585 mlog_errno(ret);
 586		/* Keep going and try to free the remaining clusters */
587 }
588
589 ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
590 brelse(bg_bh);
591}
592
593static struct buffer_head *
594ocfs2_block_group_alloc_discontig(handle_t *handle,
595 struct inode *alloc_inode,
596 struct ocfs2_alloc_context *ac,
597 struct ocfs2_chain_list *cl)
598{
599 int status;
600 u32 bit_off, num_bits;
601 u64 bg_blkno;
602 unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
603 struct buffer_head *bg_bh = NULL;
604 unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
605 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
606
607 if (!ocfs2_supports_discontig_bg(osb)) {
608 status = -ENOSPC;
609 goto bail;
610 }
611
612 status = ocfs2_extend_trans(handle,
613 ocfs2_calc_bg_discontig_credits(osb->sb));
614 if (status) {
615 mlog_errno(status);
616 goto bail;
617 }
618
619 /*
620 * We're going to be grabbing from multiple cluster groups.
621 * We don't have enough credits to relink them all, and the
622 * cluster groups will be staying in cache for the duration of
623 * this operation.
624 */
625 ac->ac_allow_chain_relink = 0;
626
627 /* Claim the first region */
628 status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
629 &bit_off, &num_bits);
630 if (status < 0) {
631 if (status != -ENOSPC)
632 mlog_errno(status);
633 goto bail;
634 }
635 min_bits = num_bits;
636
637 /* setup the group */
638 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
639 mlog(0, "new descriptor, record %u, at block %llu\n",
640 alloc_rec, (unsigned long long)bg_blkno);
641
642 bg_bh = sb_getblk(osb->sb, bg_blkno);
643 if (!bg_bh) {
644 status = -EIO;
645 mlog_errno(status);
646 goto bail;
647 }
648 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
649
650 status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
651 bg_blkno, num_bits, alloc_rec, cl);
652 if (status < 0) {
653 mlog_errno(status);
654 goto bail;
655 }
656
657 status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
658 bg_bh, ac, cl, min_bits);
659 if (status)
660 mlog_errno(status);
661
662bail:
663 if (status)
664 ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
665 return status ? ERR_PTR(status) : bg_bh;
666}
667
404/* 668/*
405 * We expect the block group allocator to already be locked. 669 * We expect the block group allocator to already be locked.
406 */ 670 */
@@ -416,9 +680,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
416 struct ocfs2_chain_list *cl; 680 struct ocfs2_chain_list *cl;
417 struct ocfs2_alloc_context *ac = NULL; 681 struct ocfs2_alloc_context *ac = NULL;
418 handle_t *handle = NULL; 682 handle_t *handle = NULL;
419 u32 bit_off, num_bits;
420 u16 alloc_rec; 683 u16 alloc_rec;
421 u64 bg_blkno;
422 struct buffer_head *bg_bh = NULL; 684 struct buffer_head *bg_bh = NULL;
423 struct ocfs2_group_desc *bg; 685 struct ocfs2_group_desc *bg;
424 686
@@ -451,44 +713,20 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
451 (unsigned long long)*last_alloc_group); 713 (unsigned long long)*last_alloc_group);
452 ac->ac_last_group = *last_alloc_group; 714 ac->ac_last_group = *last_alloc_group;
453 } 715 }
454 status = ocfs2_claim_clusters(osb, 716
455 handle, 717 bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
456 ac, 718 ac, cl);
457 le16_to_cpu(cl->cl_cpg), 719 if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC))
458 &bit_off, 720 bg_bh = ocfs2_block_group_alloc_discontig(handle,
459 &num_bits); 721 alloc_inode,
460 if (status < 0) { 722 ac, cl);
723 if (IS_ERR(bg_bh)) {
724 status = PTR_ERR(bg_bh);
725 bg_bh = NULL;
461 if (status != -ENOSPC) 726 if (status != -ENOSPC)
462 mlog_errno(status); 727 mlog_errno(status);
463 goto bail; 728 goto bail;
464 } 729 }
465
466 alloc_rec = ocfs2_find_smallest_chain(cl);
467
468 /* setup the group */
469 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
470 mlog(0, "new descriptor, record %u, at block %llu\n",
471 alloc_rec, (unsigned long long)bg_blkno);
472
473 bg_bh = sb_getblk(osb->sb, bg_blkno);
474 if (!bg_bh) {
475 status = -EIO;
476 mlog_errno(status);
477 goto bail;
478 }
479 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
480
481 status = ocfs2_block_group_fill(handle,
482 alloc_inode,
483 bg_bh,
484 bg_blkno,
485 alloc_rec,
486 cl);
487 if (status < 0) {
488 mlog_errno(status);
489 goto bail;
490 }
491
492 bg = (struct ocfs2_group_desc *) bg_bh->b_data; 730 bg = (struct ocfs2_group_desc *) bg_bh->b_data;
493 731
494 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 732 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
@@ -498,10 +736,12 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
498 goto bail; 736 goto bail;
499 } 737 }
500 738
739 alloc_rec = le16_to_cpu(bg->bg_chain);
501 le32_add_cpu(&cl->cl_recs[alloc_rec].c_free, 740 le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
502 le16_to_cpu(bg->bg_free_bits_count)); 741 le16_to_cpu(bg->bg_free_bits_count));
503 le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits)); 742 le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
504 cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno); 743 le16_to_cpu(bg->bg_bits));
744 cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg->bg_blkno);
505 if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count)) 745 if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
506 le16_add_cpu(&cl->cl_next_free_rec, 1); 746 le16_add_cpu(&cl->cl_next_free_rec, 1);
507 747
@@ -510,11 +750,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
510 le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits)); 750 le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
511 le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg)); 751 le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
512 752
513 status = ocfs2_journal_dirty(handle, bh); 753 ocfs2_journal_dirty(handle, bh);
514 if (status < 0) {
515 mlog_errno(status);
516 goto bail;
517 }
518 754
519 spin_lock(&OCFS2_I(alloc_inode)->ip_lock); 755 spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
520 OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 756 OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
@@ -764,7 +1000,7 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
764 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 1000 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
765 EXTENT_ALLOC_SYSTEM_INODE, 1001 EXTENT_ALLOC_SYSTEM_INODE,
766 (u32)osb->slot_num, NULL, 1002 (u32)osb->slot_num, NULL,
767 ALLOC_NEW_GROUP); 1003 ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);
768 1004
769 1005
770 if (status >= 0) { 1006 if (status >= 0) {
@@ -950,11 +1186,7 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
950 status = ocfs2_reserve_local_alloc_bits(osb, 1186 status = ocfs2_reserve_local_alloc_bits(osb,
951 bits_wanted, 1187 bits_wanted,
952 *ac); 1188 *ac);
953 if (status == -EFBIG) { 1189 if ((status < 0) && (status != -ENOSPC)) {
954 /* The local alloc window is outside ac_max_block.
955 * use the main bitmap. */
956 status = -ENOSPC;
957 } else if ((status < 0) && (status != -ENOSPC)) {
958 mlog_errno(status); 1190 mlog_errno(status);
959 goto bail; 1191 goto bail;
960 } 1192 }
@@ -1037,8 +1269,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1037 struct buffer_head *bg_bh, 1269 struct buffer_head *bg_bh,
1038 unsigned int bits_wanted, 1270 unsigned int bits_wanted,
1039 unsigned int total_bits, 1271 unsigned int total_bits,
1040 u16 *bit_off, 1272 struct ocfs2_suballoc_result *res)
1041 u16 *bits_found)
1042{ 1273{
1043 void *bitmap; 1274 void *bitmap;
1044 u16 best_offset, best_size; 1275 u16 best_offset, best_size;
@@ -1082,14 +1313,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1082 } 1313 }
1083 } 1314 }
1084 1315
1085 /* XXX: I think the first clause is equivalent to the second 1316 if (best_size) {
1086 * - jlbec */ 1317 res->sr_bit_offset = best_offset;
1087 if (found == bits_wanted) { 1318 res->sr_bits = best_size;
1088 *bit_off = start - found;
1089 *bits_found = found;
1090 } else if (best_size) {
1091 *bit_off = best_offset;
1092 *bits_found = best_size;
1093 } else { 1319 } else {
1094 status = -ENOSPC; 1320 status = -ENOSPC;
1095 /* No error log here -- see the comment above 1321 /* No error log here -- see the comment above
@@ -1133,16 +1359,10 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
1133 } 1359 }
1134 1360
1135 le16_add_cpu(&bg->bg_free_bits_count, -num_bits); 1361 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1136
1137 while(num_bits--) 1362 while(num_bits--)
1138 ocfs2_set_bit(bit_off++, bitmap); 1363 ocfs2_set_bit(bit_off++, bitmap);
1139 1364
1140 status = ocfs2_journal_dirty(handle, 1365 ocfs2_journal_dirty(handle, group_bh);
1141 group_bh);
1142 if (status < 0) {
1143 mlog_errno(status);
1144 goto bail;
1145 }
1146 1366
1147bail: 1367bail:
1148 mlog_exit(status); 1368 mlog_exit(status);
@@ -1206,12 +1426,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
1206 } 1426 }
1207 1427
1208 prev_bg->bg_next_group = bg->bg_next_group; 1428 prev_bg->bg_next_group = bg->bg_next_group;
1209 1429 ocfs2_journal_dirty(handle, prev_bg_bh);
1210 status = ocfs2_journal_dirty(handle, prev_bg_bh);
1211 if (status < 0) {
1212 mlog_errno(status);
1213 goto out_rollback;
1214 }
1215 1430
1216 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 1431 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1217 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1432 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1221,12 +1436,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
1221 } 1436 }
1222 1437
1223 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; 1438 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1224 1439 ocfs2_journal_dirty(handle, bg_bh);
1225 status = ocfs2_journal_dirty(handle, bg_bh);
1226 if (status < 0) {
1227 mlog_errno(status);
1228 goto out_rollback;
1229 }
1230 1440
1231 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 1441 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1232 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1442 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1236,14 +1446,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
1236 } 1446 }
1237 1447
1238 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; 1448 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1449 ocfs2_journal_dirty(handle, fe_bh);
1239 1450
1240 status = ocfs2_journal_dirty(handle, fe_bh);
1241 if (status < 0) {
1242 mlog_errno(status);
1243 goto out_rollback;
1244 }
1245
1246 status = 0;
1247out_rollback: 1451out_rollback:
1248 if (status < 0) { 1452 if (status < 0) {
1249 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr); 1453 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
@@ -1267,14 +1471,13 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1267 struct buffer_head *group_bh, 1471 struct buffer_head *group_bh,
1268 u32 bits_wanted, u32 min_bits, 1472 u32 bits_wanted, u32 min_bits,
1269 u64 max_block, 1473 u64 max_block,
1270 u16 *bit_off, u16 *bits_found) 1474 struct ocfs2_suballoc_result *res)
1271{ 1475{
1272 int search = -ENOSPC; 1476 int search = -ENOSPC;
1273 int ret; 1477 int ret;
1274 u64 blkoff; 1478 u64 blkoff;
1275 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data; 1479 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1276 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1480 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1277 u16 tmp_off, tmp_found;
1278 unsigned int max_bits, gd_cluster_off; 1481 unsigned int max_bits, gd_cluster_off;
1279 1482
1280 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 1483 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
@@ -1301,15 +1504,15 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1301 1504
1302 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 1505 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1303 group_bh, bits_wanted, 1506 group_bh, bits_wanted,
1304 -						max_bits,
1305 -						&tmp_off, &tmp_found);
1507 +						max_bits, res);
1306 if (ret) 1508 if (ret)
1307 return ret; 1509 return ret;
1308 1510
1309 if (max_block) { 1511 if (max_block) {
1310 blkoff = ocfs2_clusters_to_blocks(inode->i_sb, 1512 blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1311 gd_cluster_off + 1513 gd_cluster_off +
1312 tmp_off + tmp_found); 1514 res->sr_bit_offset +
1515 res->sr_bits);
1313 mlog(0, "Checking %llu against %llu\n", 1516 mlog(0, "Checking %llu against %llu\n",
1314 (unsigned long long)blkoff, 1517 (unsigned long long)blkoff,
1315 (unsigned long long)max_block); 1518 (unsigned long long)max_block);
@@ -1321,16 +1524,14 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1321 * return success, but we still want to return 1524 * return success, but we still want to return
1322 * -ENOSPC unless it found the minimum number 1525 * -ENOSPC unless it found the minimum number
1323 * of bits. */ 1526 * of bits. */
1324 -			if (min_bits <= tmp_found) {
1325 -				*bit_off = tmp_off;
1326 -				*bits_found = tmp_found;
1527 +			if (min_bits <= res->sr_bits)
1327 search = 0; /* success */ 1528 search = 0; /* success */
1328 -			} else if (tmp_found) {
1529 +			else if (res->sr_bits) {
1329 /* 1530 /*
1330 * Don't show bits which we'll be returning 1531 * Don't show bits which we'll be returning
1331 * for allocation to the local alloc bitmap. 1532 * for allocation to the local alloc bitmap.
1332 */ 1533 */
1333 -				ocfs2_local_alloc_seen_free_bits(osb, tmp_found);
1534 +				ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits);
1334 } 1535 }
1335 } 1536 }
1336 1537
@@ -1341,7 +1542,7 @@ static int ocfs2_block_group_search(struct inode *inode,
1341 struct buffer_head *group_bh, 1542 struct buffer_head *group_bh,
1342 u32 bits_wanted, u32 min_bits, 1543 u32 bits_wanted, u32 min_bits,
1343 u64 max_block, 1544 u64 max_block,
1344 -				    u16 *bit_off, u16 *bits_found)
1545 +				    struct ocfs2_suballoc_result *res)
1345{ 1546{
1346 int ret = -ENOSPC; 1547 int ret = -ENOSPC;
1347 u64 blkoff; 1548 u64 blkoff;
@@ -1354,10 +1555,10 @@ static int ocfs2_block_group_search(struct inode *inode,
1354 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 1555 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1355 group_bh, bits_wanted, 1556 group_bh, bits_wanted,
1356 le16_to_cpu(bg->bg_bits), 1557 le16_to_cpu(bg->bg_bits),
1357 -						bit_off, bits_found);
1558 +						res);
1358 if (!ret && max_block) { 1559 if (!ret && max_block) {
1359 -		blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off +
1360 -			*bits_found;
1560 +		blkoff = le64_to_cpu(bg->bg_blkno) +
1561 +			res->sr_bit_offset + res->sr_bits;
1361 mlog(0, "Checking %llu against %llu\n", 1562 mlog(0, "Checking %llu against %llu\n",
1362 (unsigned long long)blkoff, 1563 (unsigned long long)blkoff,
1363 (unsigned long long)max_block); 1564 (unsigned long long)max_block);
@@ -1390,33 +1591,76 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1390 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); 1591 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1391 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); 1592 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1392 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); 1593 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1393 -
1394 -	ret = ocfs2_journal_dirty(handle, di_bh);
1395 -	if (ret < 0)
1396 -		mlog_errno(ret);
1594 +	ocfs2_journal_dirty(handle, di_bh);
1397 1595
1398out: 1596out:
1399 return ret; 1597 return ret;
1400} 1598}
1401 1599
1600static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
1601 struct ocfs2_extent_rec *rec,
1602 struct ocfs2_chain_list *cl)
1603{
1604 unsigned int bpc = le16_to_cpu(cl->cl_bpc);
1605 unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
1606 unsigned int bitcount = le32_to_cpu(rec->e_leaf_clusters) * bpc;
1607
1608 if (res->sr_bit_offset < bitoff)
1609 return 0;
1610 if (res->sr_bit_offset >= (bitoff + bitcount))
1611 return 0;
1612 res->sr_blkno = le64_to_cpu(rec->e_blkno) +
1613 (res->sr_bit_offset - bitoff);
1614 if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
1615 res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
1616 return 1;
1617}
1618
1619static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
1620 struct ocfs2_group_desc *bg,
1621 struct ocfs2_suballoc_result *res)
1622{
1623 int i;
1624 u64 bg_blkno = res->sr_bg_blkno; /* Save off */
1625 struct ocfs2_extent_rec *rec;
1626 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1627 struct ocfs2_chain_list *cl = &di->id2.i_chain;
1628
1629 if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
1630 res->sr_blkno = 0;
1631 return;
1632 }
1633
1634 res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
1635 res->sr_bg_blkno = 0; /* Clear it for contig block groups */
1636 if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
1637 !bg->bg_list.l_next_free_rec)
1638 return;
1639
1640 for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
1641 rec = &bg->bg_list.l_recs[i];
1642 if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
1643 res->sr_bg_blkno = bg_blkno; /* Restore */
1644 break;
1645 }
1646 }
1647}
1648
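
The two helpers above are the heart of the discontiguous block group support: a search result is expressed as a bit offset inside the group, and ocfs2_bg_discontig_fix_by_rec() translates that offset into a disk block through whichever extent record covers it, clamping sr_bits so the allocation never crosses a record boundary. A standalone sketch of the arithmetic with hypothetical numbers:

	#include <stdio.h>

	int main(void)
	{
		/* Hypothetical extent record covering bits [32, 64) of a group */
		unsigned int bpc = 8;             /* cl_bpc: bits per cluster */
		unsigned int e_cpos = 4;          /* record starts at cluster 4 */
		unsigned int e_leaf_clusters = 4; /* record spans 4 clusters */
		unsigned long long e_blkno = 5000;

		unsigned int bitoff = e_cpos * bpc;            /* 32 */
		unsigned int bitcount = e_leaf_clusters * bpc; /* 32 */

		unsigned int sr_bit_offset = 40;  /* result lands in the record */
		unsigned int sr_bits = 30;        /* ...but runs past its end */

		unsigned long long sr_blkno = e_blkno + (sr_bit_offset - bitoff);
		if (sr_bit_offset + sr_bits > bitoff + bitcount)
			sr_bits = (bitoff + bitcount) - sr_bit_offset; /* clamp */

		printf("blkno=%llu bits=%u\n", sr_blkno, sr_bits); /* 5008, 24 */
		return 0;
	}
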
1402static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, 1649static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1403 handle_t *handle, 1650 handle_t *handle,
1404 u32 bits_wanted, 1651 u32 bits_wanted,
1405 u32 min_bits, 1652 u32 min_bits,
1406 -				  u16 *bit_off,
1407 -				  unsigned int *num_bits,
1408 -				  u64 gd_blkno,
1653 +				  struct ocfs2_suballoc_result *res,
1409 u16 *bits_left) 1654 u16 *bits_left)
1410{ 1655{
1411 int ret; 1656 int ret;
1412 u16 found;
1413 struct buffer_head *group_bh = NULL; 1657 struct buffer_head *group_bh = NULL;
1414 struct ocfs2_group_desc *gd; 1658 struct ocfs2_group_desc *gd;
1415 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; 1659 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1416 struct inode *alloc_inode = ac->ac_inode; 1660 struct inode *alloc_inode = ac->ac_inode;
1417 1661
1418 -	ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno,
1419 -					  &group_bh);
1662 +	ret = ocfs2_read_group_descriptor(alloc_inode, di,
1663 +					  res->sr_bg_blkno, &group_bh);
1420 if (ret < 0) { 1664 if (ret < 0) {
1421 mlog_errno(ret); 1665 mlog_errno(ret);
1422 return ret; 1666 return ret;
@@ -1424,17 +1668,18 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1424 1668
1425 gd = (struct ocfs2_group_desc *) group_bh->b_data; 1669 gd = (struct ocfs2_group_desc *) group_bh->b_data;
1426 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, 1670 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1427 -				  ac->ac_max_block, bit_off, &found);
1671 +				  ac->ac_max_block, res);
1428 if (ret < 0) { 1672 if (ret < 0) {
1429 if (ret != -ENOSPC) 1673 if (ret != -ENOSPC)
1430 mlog_errno(ret); 1674 mlog_errno(ret);
1431 goto out; 1675 goto out;
1432 } 1676 }
1433 1677
1434 -	*num_bits = found;
1678 +	if (!ret)
1679 +		ocfs2_bg_discontig_fix_result(ac, gd, res);
1435 1680
1436 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, 1681 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1437 -					       *num_bits,
1682 +					       res->sr_bits,
1438 le16_to_cpu(gd->bg_chain)); 1683 le16_to_cpu(gd->bg_chain));
1439 if (ret < 0) { 1684 if (ret < 0) {
1440 mlog_errno(ret); 1685 mlog_errno(ret);
@@ -1442,7 +1687,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1442 } 1687 }
1443 1688
1444 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, 1689 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1445 -					 *bit_off, *num_bits);
1690 +					 res->sr_bit_offset, res->sr_bits);
1446 if (ret < 0) 1691 if (ret < 0)
1447 mlog_errno(ret); 1692 mlog_errno(ret);
1448 1693
@@ -1458,13 +1703,11 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1458 handle_t *handle, 1703 handle_t *handle,
1459 u32 bits_wanted, 1704 u32 bits_wanted,
1460 u32 min_bits, 1705 u32 min_bits,
1461 -			      u16 *bit_off,
1462 -			      unsigned int *num_bits,
1463 -			      u64 *bg_blkno,
1706 +			      struct ocfs2_suballoc_result *res,
1464 u16 *bits_left) 1707 u16 *bits_left)
1465{ 1708{
1466 int status; 1709 int status;
1467 -	u16 chain, tmp_bits;
1710 +	u16 chain;
1468 u32 tmp_used; 1711 u32 tmp_used;
1469 u64 next_group; 1712 u64 next_group;
1470 struct inode *alloc_inode = ac->ac_inode; 1713 struct inode *alloc_inode = ac->ac_inode;
@@ -1493,8 +1736,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1493 * the 1st group with any empty bits. */ 1736 * the 1st group with any empty bits. */
1494 while ((status = ac->ac_group_search(alloc_inode, group_bh, 1737 while ((status = ac->ac_group_search(alloc_inode, group_bh,
1495 bits_wanted, min_bits, 1738 bits_wanted, min_bits,
1496 -					     ac->ac_max_block, bit_off,
1497 -					     &tmp_bits)) == -ENOSPC) {
1739 +					     ac->ac_max_block,
1740 +					     res)) == -ENOSPC) {
1498 if (!bg->bg_next_group) 1741 if (!bg->bg_next_group)
1499 break; 1742 break;
1500 1743
@@ -1519,11 +1762,14 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1519 } 1762 }
1520 1763
1521 mlog(0, "alloc succeeds: we give %u bits from block group %llu\n", 1764 mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1522 -	     tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
1765 +	     res->sr_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
1523 1766
1524 -	*num_bits = tmp_bits;
1767 +	res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
1768 +
1769 +	BUG_ON(res->sr_bits == 0);
1770 +	if (!status)
1771 +		ocfs2_bg_discontig_fix_result(ac, bg, res);
1525  1772
1526 -	BUG_ON(*num_bits == 0);
1527 1773
1528 /* 1774 /*
1529 * Keep track of previous block descriptor read. When 1775 * Keep track of previous block descriptor read. When
@@ -1540,7 +1786,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1540 */ 1786 */
1541 if (ac->ac_allow_chain_relink && 1787 if (ac->ac_allow_chain_relink &&
1542 (prev_group_bh) && 1788 (prev_group_bh) &&
1543 -	    (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
1789 +	    (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
1544 status = ocfs2_relink_block_group(handle, alloc_inode, 1790 status = ocfs2_relink_block_group(handle, alloc_inode,
1545 ac->ac_bh, group_bh, 1791 ac->ac_bh, group_bh,
1546 prev_group_bh, chain); 1792 prev_group_bh, chain);
@@ -1562,31 +1808,24 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1562 } 1808 }
1563 1809
1564 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 1810 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1565 -	fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
1566 -	le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
1567 -
1568 -	status = ocfs2_journal_dirty(handle,
1569 -				     ac->ac_bh);
1570 -	if (status < 0) {
1571 -		mlog_errno(status);
1572 -		goto bail;
1573 -	}
1811 +	fe->id1.bitmap1.i_used = cpu_to_le32(res->sr_bits + tmp_used);
1812 +	le32_add_cpu(&cl->cl_recs[chain].c_free, -res->sr_bits);
1813 +	ocfs2_journal_dirty(handle, ac->ac_bh);
1574 1814
1575 status = ocfs2_block_group_set_bits(handle, 1815 status = ocfs2_block_group_set_bits(handle,
1576 alloc_inode, 1816 alloc_inode,
1577 bg, 1817 bg,
1578 group_bh, 1818 group_bh,
1579 -					    *bit_off,
1580 -					    *num_bits);
1819 +					    res->sr_bit_offset,
1820 +					    res->sr_bits);
1581 if (status < 0) { 1821 if (status < 0) {
1582 mlog_errno(status); 1822 mlog_errno(status);
1583 goto bail; 1823 goto bail;
1584 } 1824 }
1585 1825
1586 -	mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits,
1826 +	mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
1587 (unsigned long long)le64_to_cpu(fe->i_blkno)); 1827 (unsigned long long)le64_to_cpu(fe->i_blkno));
1588 1828
1589 *bg_blkno = le64_to_cpu(bg->bg_blkno);
1590 *bits_left = le16_to_cpu(bg->bg_free_bits_count); 1829 *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1591bail: 1830bail:
1592 brelse(group_bh); 1831 brelse(group_bh);
@@ -1597,19 +1836,15 @@ bail:
1597} 1836}
1598 1837
1599/* will give out up to bits_wanted contiguous bits. */ 1838/* will give out up to bits_wanted contiguous bits. */
1600 -static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1601 -				     struct ocfs2_alloc_context *ac,
1839 +static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1602 handle_t *handle, 1840 handle_t *handle,
1603 u32 bits_wanted, 1841 u32 bits_wanted,
1604 u32 min_bits, 1842 u32 min_bits,
1605 -				     u16 *bit_off,
1606 -				     unsigned int *num_bits,
1607 -				     u64 *bg_blkno)
1843 +				     struct ocfs2_suballoc_result *res)
1608{ 1844{
1609 int status; 1845 int status;
1610 u16 victim, i; 1846 u16 victim, i;
1611 u16 bits_left = 0; 1847 u16 bits_left = 0;
1612 u64 hint_blkno = ac->ac_last_group;
1613 struct ocfs2_chain_list *cl; 1848 struct ocfs2_chain_list *cl;
1614 struct ocfs2_dinode *fe; 1849 struct ocfs2_dinode *fe;
1615 1850
@@ -1627,7 +1862,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1627 1862
1628 if (le32_to_cpu(fe->id1.bitmap1.i_used) >= 1863 if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1629 le32_to_cpu(fe->id1.bitmap1.i_total)) { 1864 le32_to_cpu(fe->id1.bitmap1.i_total)) {
1630 -		ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
1865 +		ocfs2_error(ac->ac_inode->i_sb,
1866 +			    "Chain allocator dinode %llu has %u used "
1631 "bits but only %u total.", 1867 "bits but only %u total.",
1632 (unsigned long long)le64_to_cpu(fe->i_blkno), 1868 (unsigned long long)le64_to_cpu(fe->i_blkno),
1633 le32_to_cpu(fe->id1.bitmap1.i_used), 1869 le32_to_cpu(fe->id1.bitmap1.i_used),
@@ -1636,22 +1872,16 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1636 goto bail; 1872 goto bail;
1637 } 1873 }
1638 1874
1639 -	if (hint_blkno) {
1875 +	res->sr_bg_blkno = ac->ac_last_group;
1876 +	if (res->sr_bg_blkno) {
1640 /* Attempt to short-circuit the usual search mechanism 1877 /* Attempt to short-circuit the usual search mechanism
1641 * by jumping straight to the most recently used 1878 * by jumping straight to the most recently used
1642 * allocation group. This helps us maintain some 1879 * allocation group. This helps us maintain some
1643 * contiguousness across allocations. */ 1880 * contiguousness across allocations. */
1644 status = ocfs2_search_one_group(ac, handle, bits_wanted, 1881 status = ocfs2_search_one_group(ac, handle, bits_wanted,
1645 min_bits, bit_off, num_bits, 1882 min_bits, res, &bits_left);
1646 hint_blkno, &bits_left); 1883 if (!status)
1647 if (!status) {
1648 /* Be careful to update *bg_blkno here as the
1649 * caller is expecting it to be filled in, and
1650 * ocfs2_search_one_group() won't do that for
1651 * us. */
1652 *bg_blkno = hint_blkno;
1653 goto set_hint; 1884 goto set_hint;
1654 }
1655 if (status < 0 && status != -ENOSPC) { 1885 if (status < 0 && status != -ENOSPC) {
1656 mlog_errno(status); 1886 mlog_errno(status);
1657 goto bail; 1887 goto bail;
@@ -1664,8 +1894,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1664 ac->ac_chain = victim; 1894 ac->ac_chain = victim;
1665 ac->ac_allow_chain_relink = 1; 1895 ac->ac_allow_chain_relink = 1;
1666 1896
1667 -	status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off,
1668 -				    num_bits, bg_blkno, &bits_left);
1897 +	status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1898 +				    res, &bits_left);
1669 if (!status) 1899 if (!status)
1670 goto set_hint; 1900 goto set_hint;
1671 if (status < 0 && status != -ENOSPC) { 1901 if (status < 0 && status != -ENOSPC) {
@@ -1689,8 +1919,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1689 1919
1690 ac->ac_chain = i; 1920 ac->ac_chain = i;
1691 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, 1921 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1692 -					    bit_off, num_bits, bg_blkno,
1693 -					    &bits_left);
1922 +					    res, &bits_left);
1694 if (!status) 1923 if (!status)
1695 break; 1924 break;
1696 if (status < 0 && status != -ENOSPC) { 1925 if (status < 0 && status != -ENOSPC) {
@@ -1707,7 +1936,7 @@ set_hint:
1707 if (bits_left < min_bits) 1936 if (bits_left < min_bits)
1708 ac->ac_last_group = 0; 1937 ac->ac_last_group = 0;
1709 else 1938 else
1710 -			ac->ac_last_group = *bg_blkno;
1939 +			ac->ac_last_group = res->sr_bg_blkno;
1711 } 1940 }
1712 1941
1713bail: 1942bail:
@@ -1715,37 +1944,37 @@ bail:
1715 return status; 1944 return status;
1716} 1945}
1717 1946
1718 -int ocfs2_claim_metadata(struct ocfs2_super *osb,
1719 -			 handle_t *handle,
1947 +int ocfs2_claim_metadata(handle_t *handle,
1720 struct ocfs2_alloc_context *ac, 1948 struct ocfs2_alloc_context *ac,
1721 u32 bits_wanted, 1949 u32 bits_wanted,
1950 u64 *suballoc_loc,
1722 u16 *suballoc_bit_start, 1951 u16 *suballoc_bit_start,
1723 unsigned int *num_bits, 1952 unsigned int *num_bits,
1724 u64 *blkno_start) 1953 u64 *blkno_start)
1725{ 1954{
1726 int status; 1955 int status;
1727 -	u64 bg_blkno;
1956 +	struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
1728 1957
1729 BUG_ON(!ac); 1958 BUG_ON(!ac);
1730 BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted)); 1959 BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1731 BUG_ON(ac->ac_which != OCFS2_AC_USE_META); 1960 BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1732 1961
1733 -	status = ocfs2_claim_suballoc_bits(osb,
1734 -					   ac,
1962 +	status = ocfs2_claim_suballoc_bits(ac,
1735 handle, 1963 handle,
1736 bits_wanted, 1964 bits_wanted,
1737 1, 1965 1,
1738 -					   suballoc_bit_start,
1739 -					   num_bits,
1740 -					   &bg_blkno);
1966 +					   &res);
1741 if (status < 0) { 1967 if (status < 0) {
1742 mlog_errno(status); 1968 mlog_errno(status);
1743 goto bail; 1969 goto bail;
1744 } 1970 }
1745 -	atomic_inc(&osb->alloc_stats.bg_allocs);
1971 +	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
1746 1972
1747 -	*blkno_start = bg_blkno + (u64) *suballoc_bit_start;
1748 -	ac->ac_bits_given += (*num_bits);
1973 +	*suballoc_loc = res.sr_bg_blkno;
1974 +	*suballoc_bit_start = res.sr_bit_offset;
1975 +	*blkno_start = res.sr_blkno;
1976 +	ac->ac_bits_given += res.sr_bits;
1977 +	*num_bits = res.sr_bits;
1749 status = 0; 1978 status = 0;
1750bail: 1979bail:
1751 mlog_exit(status); 1980 mlog_exit(status);
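
Callers of the reworked ocfs2_claim_metadata() now receive the group location alongside the bit and block; the ocfs2_create_xattr_block() hunk later in this diff is a real example, condensed here:

	u64 suballoc_loc, first_blkno;
	u16 suballoc_bit_start;
	u32 num_got;

	ret = ocfs2_claim_metadata(ctxt->handle, ctxt->meta_ac, 1,
				   &suballoc_loc, &suballoc_bit_start,
				   &num_got, &first_blkno);
	/* suballoc_loc is then recorded in the new object's
	 * xb_suballoc_loc / i_suballoc_loc field so the owning group
	 * can be found again at free time, even when discontiguous. */
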
@@ -1753,10 +1982,10 @@ bail:
1753} 1982}
1754 1983
1755static void ocfs2_init_inode_ac_group(struct inode *dir, 1984static void ocfs2_init_inode_ac_group(struct inode *dir,
1756 -				      struct buffer_head *parent_fe_bh,
1985 +				      struct buffer_head *parent_di_bh,
1757 struct ocfs2_alloc_context *ac) 1986 struct ocfs2_alloc_context *ac)
1758{ 1987{
1759 -	struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data;
1988 +	struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
1760 /* 1989 /*
1761 * Try to allocate inodes from some specific group. 1990 * Try to allocate inodes from some specific group.
1762 * 1991 *
@@ -1770,10 +1999,14 @@ static void ocfs2_init_inode_ac_group(struct inode *dir,
1770 if (OCFS2_I(dir)->ip_last_used_group && 1999 if (OCFS2_I(dir)->ip_last_used_group &&
1771 OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot) 2000 OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
1772 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group; 2001 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
1773 -	else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot)
1774 -		ac->ac_last_group = ocfs2_which_suballoc_group(
1775 -					le64_to_cpu(fe->i_blkno),
1776 -					le16_to_cpu(fe->i_suballoc_bit));
2002 +	else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
2003 +		if (di->i_suballoc_loc)
2004 +			ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
2005 +		else
2006 +			ac->ac_last_group = ocfs2_which_suballoc_group(
2007 +					le64_to_cpu(di->i_blkno),
2008 +					le16_to_cpu(di->i_suballoc_bit));
2009 +	}
1777} 2010}
1778 2011
1779static inline void ocfs2_save_inode_ac_group(struct inode *dir, 2012static inline void ocfs2_save_inode_ac_group(struct inode *dir,
@@ -1783,17 +2016,16 @@ static inline void ocfs2_save_inode_ac_group(struct inode *dir,
1783 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; 2016 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
1784} 2017}
1785 2018
1786 -int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1787 -			  handle_t *handle,
2019 +int ocfs2_claim_new_inode(handle_t *handle,
1788 struct inode *dir, 2020 struct inode *dir,
1789 struct buffer_head *parent_fe_bh, 2021 struct buffer_head *parent_fe_bh,
1790 struct ocfs2_alloc_context *ac, 2022 struct ocfs2_alloc_context *ac,
2023 u64 *suballoc_loc,
1791 u16 *suballoc_bit, 2024 u16 *suballoc_bit,
1792 u64 *fe_blkno) 2025 u64 *fe_blkno)
1793{ 2026{
1794 int status; 2027 int status;
1795 -	unsigned int num_bits;
1796 -	u64 bg_blkno;
2028 +	struct ocfs2_suballoc_result res;
1797 2029
1798 mlog_entry_void(); 2030 mlog_entry_void();
1799 2031
@@ -1804,23 +2036,22 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1804 2036
1805 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); 2037 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
1806 2038
1807 -	status = ocfs2_claim_suballoc_bits(osb,
1808 -					   ac,
2039 +	status = ocfs2_claim_suballoc_bits(ac,
1809 handle, 2040 handle,
1810 1, 2041 1,
1811 1, 2042 1,
1812 -					   suballoc_bit,
1813 -					   &num_bits,
1814 -					   &bg_blkno);
2043 +					   &res);
1815 if (status < 0) { 2044 if (status < 0) {
1816 mlog_errno(status); 2045 mlog_errno(status);
1817 goto bail; 2046 goto bail;
1818 } 2047 }
1819 -	atomic_inc(&osb->alloc_stats.bg_allocs);
2048 +	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
1820 2049
1821 -	BUG_ON(num_bits != 1);
2050 +	BUG_ON(res.sr_bits != 1);
1822 2051
1823 -	*fe_blkno = bg_blkno + (u64) (*suballoc_bit);
2052 +	*suballoc_loc = res.sr_bg_blkno;
2053 +	*suballoc_bit = res.sr_bit_offset;
2054 +	*fe_blkno = res.sr_blkno;
1824 ac->ac_bits_given++; 2055 ac->ac_bits_given++;
1825 ocfs2_save_inode_ac_group(dir, ac); 2056 ocfs2_save_inode_ac_group(dir, ac);
1826 status = 0; 2057 status = 0;
@@ -1890,8 +2121,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1890 * contig. allocation, set to '1' to indicate we can deal with extents 2121 * contig. allocation, set to '1' to indicate we can deal with extents
1891 * of any size. 2122 * of any size.
1892 */ 2123 */
1893 -int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1894 -			   handle_t *handle,
2124 +int __ocfs2_claim_clusters(handle_t *handle,
1895 struct ocfs2_alloc_context *ac, 2125 struct ocfs2_alloc_context *ac,
1896 u32 min_clusters, 2126 u32 min_clusters,
1897 u32 max_clusters, 2127 u32 max_clusters,
@@ -1900,8 +2130,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1900{ 2130{
1901 int status; 2131 int status;
1902 unsigned int bits_wanted = max_clusters; 2132 unsigned int bits_wanted = max_clusters;
1903 -	u64 bg_blkno = 0;
1904 -	u16 bg_bit_off;
2133 +	struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
2134 +	struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
1905 2135
1906 mlog_entry_void(); 2136 mlog_entry_void();
1907 2137
@@ -1911,6 +2141,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1911 && ac->ac_which != OCFS2_AC_USE_MAIN); 2141 && ac->ac_which != OCFS2_AC_USE_MAIN);
1912 2142
1913 if (ac->ac_which == OCFS2_AC_USE_LOCAL) { 2143 if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
2144 WARN_ON(min_clusters > 1);
2145
1914 status = ocfs2_claim_local_alloc_bits(osb, 2146 status = ocfs2_claim_local_alloc_bits(osb,
1915 handle, 2147 handle,
1916 ac, 2148 ac,
@@ -1933,20 +2165,19 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1933 if (bits_wanted > (osb->bitmap_cpg - 1)) 2165 if (bits_wanted > (osb->bitmap_cpg - 1))
1934 bits_wanted = osb->bitmap_cpg - 1; 2166 bits_wanted = osb->bitmap_cpg - 1;
1935 2167
1936 -		status = ocfs2_claim_suballoc_bits(osb,
1937 -						   ac,
2168 +		status = ocfs2_claim_suballoc_bits(ac,
1938 handle, 2169 handle,
1939 bits_wanted, 2170 bits_wanted,
1940 min_clusters, 2171 min_clusters,
1941 -						   &bg_bit_off,
1942 -						   num_clusters,
1943 -						   &bg_blkno);
2172 +						   &res);
1944 if (!status) { 2173 if (!status) {
2174 BUG_ON(res.sr_blkno); /* cluster alloc can't set */
1945 *cluster_start = 2175 *cluster_start =
1946 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode, 2176 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1947 -						 bg_blkno,
1948 -						 bg_bit_off);
2177 +						 res.sr_bg_blkno,
2178 +						 res.sr_bit_offset);
1949 atomic_inc(&osb->alloc_stats.bitmap_data); 2179 atomic_inc(&osb->alloc_stats.bitmap_data);
2180 *num_clusters = res.sr_bits;
1950 } 2181 }
1951 } 2182 }
1952 if (status < 0) { 2183 if (status < 0) {
@@ -1962,8 +2193,7 @@ bail:
1962 return status; 2193 return status;
1963} 2194}
1964 2195
1965 -int ocfs2_claim_clusters(struct ocfs2_super *osb,
1966 -			 handle_t *handle,
2196 +int ocfs2_claim_clusters(handle_t *handle,
1967 struct ocfs2_alloc_context *ac, 2197 struct ocfs2_alloc_context *ac,
1968 u32 min_clusters, 2198 u32 min_clusters,
1969 u32 *cluster_start, 2199 u32 *cluster_start,
@@ -1971,22 +2201,22 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
1971{ 2201{
1972 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; 2202 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1973 2203
1974 -	return __ocfs2_claim_clusters(osb, handle, ac, min_clusters,
2204 +	return __ocfs2_claim_clusters(handle, ac, min_clusters,
1975 bits_wanted, cluster_start, num_clusters); 2205 bits_wanted, cluster_start, num_clusters);
1976} 2206}
1977 2207
1978 -static inline int ocfs2_block_group_clear_bits(handle_t *handle,
2208 +static int ocfs2_block_group_clear_bits(handle_t *handle,
1979 struct inode *alloc_inode, 2209 struct inode *alloc_inode,
1980 struct ocfs2_group_desc *bg, 2210 struct ocfs2_group_desc *bg,
1981 struct buffer_head *group_bh, 2211 struct buffer_head *group_bh,
1982 unsigned int bit_off, 2212 unsigned int bit_off,
1983 -					 unsigned int num_bits)
2213 +					 unsigned int num_bits,
2214 +					 void (*undo_fn)(unsigned int bit,
2215 +							 unsigned long *bmap))
1984{ 2216{
1985 int status; 2217 int status;
1986 unsigned int tmp; 2218 unsigned int tmp;
1987 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1988 struct ocfs2_group_desc *undo_bg = NULL; 2219 struct ocfs2_group_desc *undo_bg = NULL;
1989 int cluster_bitmap = 0;
1990 2220
1991 mlog_entry_void(); 2221 mlog_entry_void();
1992 2222
@@ -1996,20 +2226,18 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1996 2226
1997 mlog(0, "off = %u, num = %u\n", bit_off, num_bits); 2227 mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1998 2228
1999 -	if (ocfs2_is_cluster_bitmap(alloc_inode))
2000 -		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
2001 -
2229 +	BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
2002 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 2230 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
2003 -					 group_bh, journal_type);
2231 +					 group_bh,
2232 +					 undo_fn ?
2233 +					 OCFS2_JOURNAL_ACCESS_UNDO :
2234 +					 OCFS2_JOURNAL_ACCESS_WRITE);
2004 if (status < 0) { 2235 if (status < 0) {
2005 mlog_errno(status); 2236 mlog_errno(status);
2006 goto bail; 2237 goto bail;
2007 } 2238 }
2008 2239
2009 -	if (ocfs2_is_cluster_bitmap(alloc_inode))
2010 -		cluster_bitmap = 1;
2011 -
2012 -	if (cluster_bitmap) {
2240 +	if (undo_fn) {
2013 jbd_lock_bh_state(group_bh); 2241 jbd_lock_bh_state(group_bh);
2014 undo_bg = (struct ocfs2_group_desc *) 2242 undo_bg = (struct ocfs2_group_desc *)
2015 bh2jh(group_bh)->b_committed_data; 2243 bh2jh(group_bh)->b_committed_data;
@@ -2020,18 +2248,16 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
2020 while(tmp--) { 2248 while(tmp--) {
2021 ocfs2_clear_bit((bit_off + tmp), 2249 ocfs2_clear_bit((bit_off + tmp),
2022 (unsigned long *) bg->bg_bitmap); 2250 (unsigned long *) bg->bg_bitmap);
2023 -		if (cluster_bitmap)
2024 -			ocfs2_set_bit(bit_off + tmp,
2025 -				      (unsigned long *) undo_bg->bg_bitmap);
2251 +		if (undo_fn)
2252 +			undo_fn(bit_off + tmp,
2253 +				(unsigned long *) undo_bg->bg_bitmap);
2026 } 2254 }
2027 le16_add_cpu(&bg->bg_free_bits_count, num_bits); 2255 le16_add_cpu(&bg->bg_free_bits_count, num_bits);
2028 2256
2029 -	if (cluster_bitmap)
2257 +	if (undo_fn)
2030 jbd_unlock_bh_state(group_bh); 2258 jbd_unlock_bh_state(group_bh);
2031 2259
2032 -	status = ocfs2_journal_dirty(handle, group_bh);
2033 -	if (status < 0)
2034 -		mlog_errno(status);
2260 +	ocfs2_journal_dirty(handle, group_bh);
2035bail: 2261bail:
2036 return status; 2262 return status;
2037} 2263}
@@ -2039,12 +2265,14 @@ bail:
2039/* 2265/*
2040 * expects the suballoc inode to already be locked. 2266 * expects the suballoc inode to already be locked.
2041 */ 2267 */
2042 -int ocfs2_free_suballoc_bits(handle_t *handle,
2268 +static int _ocfs2_free_suballoc_bits(handle_t *handle,
2043 struct inode *alloc_inode, 2269 struct inode *alloc_inode,
2044 struct buffer_head *alloc_bh, 2270 struct buffer_head *alloc_bh,
2045 unsigned int start_bit, 2271 unsigned int start_bit,
2046 u64 bg_blkno, 2272 u64 bg_blkno,
2047 -			     unsigned int count)
2273 +			     unsigned int count,
2274 +			     void (*undo_fn)(unsigned int bit,
2275 +					     unsigned long *bitmap))
2048{ 2276{
2049 int status = 0; 2277 int status = 0;
2050 u32 tmp_used; 2278 u32 tmp_used;
@@ -2079,7 +2307,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
2079 2307
2080 status = ocfs2_block_group_clear_bits(handle, alloc_inode, 2308 status = ocfs2_block_group_clear_bits(handle, alloc_inode,
2081 group, group_bh, 2309 group, group_bh,
2082 -					      start_bit, count);
2310 +					      start_bit, count, undo_fn);
2083 if (status < 0) { 2311 if (status < 0) {
2084 mlog_errno(status); 2312 mlog_errno(status);
2085 goto bail; 2313 goto bail;
@@ -2096,12 +2324,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
2096 count); 2324 count);
2097 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 2325 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2098 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); 2326 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
2099 -
2100 -	status = ocfs2_journal_dirty(handle, alloc_bh);
2101 -	if (status < 0) {
2102 -		mlog_errno(status);
2103 -		goto bail;
2104 -	}
2327 +	ocfs2_journal_dirty(handle, alloc_bh);
2105 2328
2106bail: 2329bail:
2107 brelse(group_bh); 2330 brelse(group_bh);
@@ -2110,6 +2333,17 @@ bail:
2110 return status; 2333 return status;
2111} 2334}
2112 2335
2336int ocfs2_free_suballoc_bits(handle_t *handle,
2337 struct inode *alloc_inode,
2338 struct buffer_head *alloc_bh,
2339 unsigned int start_bit,
2340 u64 bg_blkno,
2341 unsigned int count)
2342{
2343 return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
2344 start_bit, bg_blkno, count, NULL);
2345}
2346
2113int ocfs2_free_dinode(handle_t *handle, 2347int ocfs2_free_dinode(handle_t *handle,
2114 struct inode *inode_alloc_inode, 2348 struct inode *inode_alloc_inode,
2115 struct buffer_head *inode_alloc_bh, 2349 struct buffer_head *inode_alloc_bh,
@@ -2119,15 +2353,19 @@ int ocfs2_free_dinode(handle_t *handle,
2119 u16 bit = le16_to_cpu(di->i_suballoc_bit); 2353 u16 bit = le16_to_cpu(di->i_suballoc_bit);
2120 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 2354 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2121 2355
2356 if (di->i_suballoc_loc)
2357 bg_blkno = le64_to_cpu(di->i_suballoc_loc);
2122 return ocfs2_free_suballoc_bits(handle, inode_alloc_inode, 2358 return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
2123 inode_alloc_bh, bit, bg_blkno, 1); 2359 inode_alloc_bh, bit, bg_blkno, 1);
2124} 2360}
2125 2361
2126 -int ocfs2_free_clusters(handle_t *handle,
2362 +static int _ocfs2_free_clusters(handle_t *handle,
2127 struct inode *bitmap_inode, 2363 struct inode *bitmap_inode,
2128 struct buffer_head *bitmap_bh, 2364 struct buffer_head *bitmap_bh,
2129 u64 start_blk, 2365 u64 start_blk,
2130 -			unsigned int num_clusters)
2366 +			unsigned int num_clusters,
2367 +			void (*undo_fn)(unsigned int bit,
2368 +					unsigned long *bitmap))
2131{ 2369{
2132 int status; 2370 int status;
2133 u16 bg_start_bit; 2371 u16 bg_start_bit;
@@ -2154,9 +2392,9 @@ int ocfs2_free_clusters(handle_t *handle,
2154 mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n", 2392 mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
2155 (unsigned long long)bg_blkno, bg_start_bit); 2393 (unsigned long long)bg_blkno, bg_start_bit);
2156 2394
2157 -	status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2395 +	status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2158 bg_start_bit, bg_blkno, 2396 bg_start_bit, bg_blkno,
2159 -					  num_clusters);
2397 +					   num_clusters, undo_fn);
2160 if (status < 0) { 2398 if (status < 0) {
2161 mlog_errno(status); 2399 mlog_errno(status);
2162 goto out; 2400 goto out;
@@ -2170,6 +2408,32 @@ out:
2170 return status; 2408 return status;
2171} 2409}
2172 2410
2411int ocfs2_free_clusters(handle_t *handle,
2412 struct inode *bitmap_inode,
2413 struct buffer_head *bitmap_bh,
2414 u64 start_blk,
2415 unsigned int num_clusters)
2416{
2417 return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2418 start_blk, num_clusters,
2419 _ocfs2_set_bit);
2420}
2421
2422/*
2423 * Give never-used clusters back to the global bitmap. We don't need
2424 * to protect these bits in the undo buffer.
2425 */
2426int ocfs2_release_clusters(handle_t *handle,
2427 struct inode *bitmap_inode,
2428 struct buffer_head *bitmap_bh,
2429 u64 start_blk,
2430 unsigned int num_clusters)
2431{
2432 return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2433 start_blk, num_clusters,
2434 _ocfs2_clear_bit);
2435}
2436
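
The undo_fn parameter lets one helper serve both cases: bits freed from a live allocation must stay set in the journal's committed copy of the bitmap until the transaction commits, while never-used bits need no such protection, as the comment above says. Condensed from the two wrappers (n is a placeholder count):

	/* clusters that were really in use: keep them allocated in the
	 * committed bitmap until commit (undo_fn = _ocfs2_set_bit) */
	ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh, start_blk, n);

	/* clusters claimed but never used: clear them in the committed
	 * copy as well (undo_fn = _ocfs2_clear_bit) */
	ocfs2_release_clusters(handle, bitmap_inode, bitmap_bh, start_blk, n);
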
2173static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg) 2437static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
2174{ 2438{
2175 printk("Block Group:\n"); 2439 printk("Block Group:\n");
@@ -2360,7 +2624,7 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2360 struct buffer_head *alloc_bh, u64 blkno, 2624 struct buffer_head *alloc_bh, u64 blkno,
2361 u16 bit, int *res) 2625 u16 bit, int *res)
2362{ 2626{
2363 struct ocfs2_dinode *alloc_fe; 2627 struct ocfs2_dinode *alloc_di;
2364 struct ocfs2_group_desc *group; 2628 struct ocfs2_group_desc *group;
2365 struct buffer_head *group_bh = NULL; 2629 struct buffer_head *group_bh = NULL;
2366 u64 bg_blkno; 2630 u64 bg_blkno;
@@ -2369,17 +2633,20 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2369 mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno, 2633 mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
2370 (unsigned int)bit); 2634 (unsigned int)bit);
2371 2635
2372 -	alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
2373 -	if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) {
2636 +	alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
2637 +	if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
2374 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n", 2638 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2375 (unsigned int)bit, 2639 (unsigned int)bit,
2376 -		     ocfs2_bits_per_group(&alloc_fe->id2.i_chain));
2640 +		     ocfs2_bits_per_group(&alloc_di->id2.i_chain));
2377 status = -EINVAL; 2641 status = -EINVAL;
2378 goto bail; 2642 goto bail;
2379 } 2643 }
2380 2644
2381 -	bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
2382 -	status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno,
2645 +	if (alloc_di->i_suballoc_loc)
2646 +		bg_blkno = le64_to_cpu(alloc_di->i_suballoc_loc);
2647 +	else
2648 +		bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
2649 +	status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
2383 &group_bh); 2650 &group_bh);
2384 if (status < 0) { 2651 if (status < 0) {
2385 mlog(ML_ERROR, "read group %llu failed %d\n", 2652 mlog(ML_ERROR, "read group %llu failed %d\n",
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index fa60723c43e8..a017dd3ee7d9 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -26,13 +26,14 @@
26#ifndef _CHAINALLOC_H_ 26#ifndef _CHAINALLOC_H_
27#define _CHAINALLOC_H_ 27#define _CHAINALLOC_H_
28 28
29struct ocfs2_suballoc_result;
29typedef int (group_search_t)(struct inode *, 30typedef int (group_search_t)(struct inode *,
30 struct buffer_head *, 31 struct buffer_head *,
31 u32, /* bits_wanted */ 32 u32, /* bits_wanted */
32 u32, /* min_bits */ 33 u32, /* min_bits */
33 u64, /* max_block */ 34 u64, /* max_block */
34 -			     u16 *,	/* *bit_off */
35 -			     u16 *);	/* *bits_found */
35 +			     struct ocfs2_suballoc_result *);
36 +					/* found bits */
36 37
37struct ocfs2_alloc_context { 38struct ocfs2_alloc_context {
38 struct inode *ac_inode; /* which bitmap are we allocating from? */ 39 struct inode *ac_inode; /* which bitmap are we allocating from? */
@@ -54,6 +55,8 @@ struct ocfs2_alloc_context {
54 u64 ac_last_group; 55 u64 ac_last_group;
55 u64 ac_max_block; /* Highest block number to allocate. 0 is 56 u64 ac_max_block; /* Highest block number to allocate. 0 is
56 is the same as ~0 - unlimited */ 57 is the same as ~0 - unlimited */
58
59 struct ocfs2_alloc_reservation *ac_resv;
57}; 60};
58 61
59void ocfs2_init_steal_slots(struct ocfs2_super *osb); 62void ocfs2_init_steal_slots(struct ocfs2_super *osb);
@@ -80,22 +83,21 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
80 u32 bits_wanted, 83 u32 bits_wanted,
81 struct ocfs2_alloc_context **ac); 84 struct ocfs2_alloc_context **ac);
82 85
83 -int ocfs2_claim_metadata(struct ocfs2_super *osb,
84 -			 handle_t *handle,
86 +int ocfs2_claim_metadata(handle_t *handle,
85 struct ocfs2_alloc_context *ac, 87 struct ocfs2_alloc_context *ac,
86 u32 bits_wanted, 88 u32 bits_wanted,
89 u64 *suballoc_loc,
87 u16 *suballoc_bit_start, 90 u16 *suballoc_bit_start,
88 u32 *num_bits, 91 u32 *num_bits,
89 u64 *blkno_start); 92 u64 *blkno_start);
90 -int ocfs2_claim_new_inode(struct ocfs2_super *osb,
91 -			  handle_t *handle,
93 +int ocfs2_claim_new_inode(handle_t *handle,
92 struct inode *dir, 94 struct inode *dir,
93 struct buffer_head *parent_fe_bh, 95 struct buffer_head *parent_fe_bh,
94 struct ocfs2_alloc_context *ac, 96 struct ocfs2_alloc_context *ac,
97 u64 *suballoc_loc,
95 u16 *suballoc_bit, 98 u16 *suballoc_bit,
96 u64 *fe_blkno); 99 u64 *fe_blkno);
97 -int ocfs2_claim_clusters(struct ocfs2_super *osb,
98 -			 handle_t *handle,
100 +int ocfs2_claim_clusters(handle_t *handle,
99 struct ocfs2_alloc_context *ac, 101 struct ocfs2_alloc_context *ac,
100 u32 min_clusters, 102 u32 min_clusters,
101 u32 *cluster_start, 103 u32 *cluster_start,
@@ -104,8 +106,7 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
104 * Use this variant of ocfs2_claim_clusters to specify a maximum 106
105 * number of clusters smaller than the allocation reserved. 107 * number of clusters smaller than the allocation reserved.
106 */ 108 */
107 -int __ocfs2_claim_clusters(struct ocfs2_super *osb,
108 -			   handle_t *handle,
109 +int __ocfs2_claim_clusters(handle_t *handle,
109 struct ocfs2_alloc_context *ac, 110 struct ocfs2_alloc_context *ac,
110 u32 min_clusters, 111 u32 min_clusters,
111 u32 max_clusters, 112 u32 max_clusters,
@@ -127,6 +128,11 @@ int ocfs2_free_clusters(handle_t *handle,
127 struct buffer_head *bitmap_bh, 128 struct buffer_head *bitmap_bh,
128 u64 start_blk, 129 u64 start_blk,
129 unsigned int num_clusters); 130 unsigned int num_clusters);
131int ocfs2_release_clusters(handle_t *handle,
132 struct inode *bitmap_inode,
133 struct buffer_head *bitmap_bh,
134 u64 start_blk,
135 unsigned int num_clusters);
130 136
131static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit) 137static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
132{ 138{
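
After this header change, any group-search callback takes the result structure in place of the two u16 out-parameters; both implementations in suballoc.c (ocfs2_cluster_group_search and ocfs2_block_group_search, converted above) match the new typedef. A conforming prototype (the name is hypothetical):

	static int example_group_search(struct inode *inode,
					struct buffer_head *group_bh,
					u32 bits_wanted, u32 min_bits,
					u64 max_block,
					struct ocfs2_suballoc_result *res);
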
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index dee03197a494..1c2c39f6f0b6 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -94,7 +94,9 @@ struct mount_options
94 unsigned long mount_opt; 94 unsigned long mount_opt;
95 unsigned int atime_quantum; 95 unsigned int atime_quantum;
96 signed short slot; 96 signed short slot;
97 -	unsigned int localalloc_opt;
97 +	int localalloc_opt;
98 unsigned int resv_level;
99 int dir_resv_level;
98 char cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; 100 char cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
99}; 101};
100 102
@@ -176,6 +178,8 @@ enum {
176 Opt_noacl, 178 Opt_noacl,
177 Opt_usrquota, 179 Opt_usrquota,
178 Opt_grpquota, 180 Opt_grpquota,
181 Opt_resv_level,
182 Opt_dir_resv_level,
179 Opt_err, 183 Opt_err,
180}; 184};
181 185
@@ -202,6 +206,8 @@ static const match_table_t tokens = {
202 {Opt_noacl, "noacl"}, 206 {Opt_noacl, "noacl"},
203 {Opt_usrquota, "usrquota"}, 207 {Opt_usrquota, "usrquota"},
204 {Opt_grpquota, "grpquota"}, 208 {Opt_grpquota, "grpquota"},
209 {Opt_resv_level, "resv_level=%u"},
210 {Opt_dir_resv_level, "dir_resv_level=%u"},
205 {Opt_err, NULL} 211 {Opt_err, NULL}
206}; 212};
207 213
@@ -1028,8 +1034,14 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1028 osb->s_atime_quantum = parsed_options.atime_quantum; 1034 osb->s_atime_quantum = parsed_options.atime_quantum;
1029 osb->preferred_slot = parsed_options.slot; 1035 osb->preferred_slot = parsed_options.slot;
1030 osb->osb_commit_interval = parsed_options.commit_interval; 1036 osb->osb_commit_interval = parsed_options.commit_interval;
1031 -	osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
1032 -	osb->local_alloc_bits = osb->local_alloc_default_bits;
1037 +
1038 +	ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt);
1039 osb->osb_resv_level = parsed_options.resv_level;
1040 osb->osb_dir_resv_level = parsed_options.resv_level;
1041 if (parsed_options.dir_resv_level == -1)
1042 osb->osb_dir_resv_level = parsed_options.resv_level;
1043 else
1044 osb->osb_dir_resv_level = parsed_options.dir_resv_level;
1033 1045
1034 status = ocfs2_verify_userspace_stack(osb, &parsed_options); 1046 status = ocfs2_verify_userspace_stack(osb, &parsed_options);
1035 if (status) 1047 if (status)
@@ -1285,11 +1297,13 @@ static int ocfs2_parse_options(struct super_block *sb,
1285 options ? options : "(none)"); 1297 options ? options : "(none)");
1286 1298
1287 mopt->commit_interval = 0; 1299 mopt->commit_interval = 0;
1288 -	mopt->mount_opt = 0;
1300 +	mopt->mount_opt = OCFS2_MOUNT_NOINTR;
1289 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 1301 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
1290 mopt->slot = OCFS2_INVALID_SLOT; 1302 mopt->slot = OCFS2_INVALID_SLOT;
1291 -	mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
1303 +	mopt->localalloc_opt = -1;
1292 mopt->cluster_stack[0] = '\0'; 1304 mopt->cluster_stack[0] = '\0';
1305 mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL;
1306 mopt->dir_resv_level = -1;
1293 1307
1294 if (!options) { 1308 if (!options) {
1295 status = 1; 1309 status = 1;
@@ -1380,7 +1394,7 @@ static int ocfs2_parse_options(struct super_block *sb,
1380 status = 0; 1394 status = 0;
1381 goto bail; 1395 goto bail;
1382 } 1396 }
1383 -			if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8))
1397 +			if (option >= 0)
1384 mopt->localalloc_opt = option; 1398 mopt->localalloc_opt = option;
1385 break; 1399 break;
1386 case Opt_localflocks: 1400 case Opt_localflocks:
@@ -1433,6 +1447,28 @@ static int ocfs2_parse_options(struct super_block *sb,
1433 mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; 1447 mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
1434 mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; 1448 mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
1435 break; 1449 break;
1450 case Opt_resv_level:
1451 if (is_remount)
1452 break;
1453 if (match_int(&args[0], &option)) {
1454 status = 0;
1455 goto bail;
1456 }
1457 if (option >= OCFS2_MIN_RESV_LEVEL &&
1458 option < OCFS2_MAX_RESV_LEVEL)
1459 mopt->resv_level = option;
1460 break;
1461 case Opt_dir_resv_level:
1462 if (is_remount)
1463 break;
1464 if (match_int(&args[0], &option)) {
1465 status = 0;
1466 goto bail;
1467 }
1468 if (option >= OCFS2_MIN_RESV_LEVEL &&
1469 option < OCFS2_MAX_RESV_LEVEL)
1470 mopt->dir_resv_level = option;
1471 break;
1436 default: 1472 default:
1437 mlog(ML_ERROR, 1473 mlog(ML_ERROR,
1438 "Unrecognized mount option \"%s\" " 1474 "Unrecognized mount option \"%s\" "
@@ -1487,7 +1523,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1487 (unsigned) (osb->osb_commit_interval / HZ)); 1523 (unsigned) (osb->osb_commit_interval / HZ));
1488 1524
1489 local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits); 1525 local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits);
1490 -	if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
1526 +	if (local_alloc_megs != ocfs2_la_default_mb(osb))
1491 seq_printf(s, ",localalloc=%d", local_alloc_megs); 1527 seq_printf(s, ",localalloc=%d", local_alloc_megs);
1492 1528
1493 if (opts & OCFS2_MOUNT_LOCALFLOCKS) 1529 if (opts & OCFS2_MOUNT_LOCALFLOCKS)
@@ -1514,6 +1550,12 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1514 else 1550 else
1515 seq_printf(s, ",noacl"); 1551 seq_printf(s, ",noacl");
1516 1552
1553 if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL)
1554 seq_printf(s, ",resv_level=%d", osb->osb_resv_level);
1555
1556 if (osb->osb_dir_resv_level != osb->osb_resv_level)
1557 		seq_printf(s, ",dir_resv_level=%d", osb->osb_dir_resv_level);
1558
1517 return 0; 1559 return 0;
1518} 1560}
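
The two new options round-trip through parsing and ocfs2_show_options(): resv_level=%u and dir_resv_level=%u are accepted within [OCFS2_MIN_RESV_LEVEL, OCFS2_MAX_RESV_LEVEL) and ignored on remount (e.g. mount -o resv_level=4,dir_resv_level=2 with illustrative values; the bounds themselves are defined outside these hunks). The effective-level selection from the fill_super hunk, condensed:

	/* dir_resv_level falls back to resv_level when left at its
	 * -1 sentinel (sketch of the logic in ocfs2_fill_super above) */
	osb->osb_resv_level = parsed_options.resv_level;
	osb->osb_dir_resv_level = (parsed_options.dir_resv_level == -1) ?
		parsed_options.resv_level : parsed_options.dir_resv_level;
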
1519 1561
@@ -1688,6 +1730,8 @@ static void ocfs2_inode_init_once(void *data)
1688 oi->ip_blkno = 0ULL; 1730 oi->ip_blkno = 0ULL;
1689 oi->ip_clusters = 0; 1731 oi->ip_clusters = 0;
1690 1732
1733 ocfs2_resv_init_once(&oi->ip_la_data_resv);
1734
1691 ocfs2_lock_res_init_once(&oi->ip_rw_lockres); 1735 ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
1692 ocfs2_lock_res_init_once(&oi->ip_inode_lockres); 1736 ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
1693 ocfs2_lock_res_init_once(&oi->ip_open_lockres); 1737 ocfs2_lock_res_init_once(&oi->ip_open_lockres);
@@ -2042,6 +2086,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
2042 2086
2043 init_waitqueue_head(&osb->osb_mount_event); 2087 init_waitqueue_head(&osb->osb_mount_event);
2044 2088
2089 status = ocfs2_resmap_init(osb, &osb->osb_la_resmap);
2090 if (status) {
2091 mlog_errno(status);
2092 goto bail;
2093 }
2094
2045 osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); 2095 osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
2046 if (!osb->vol_label) { 2096 if (!osb->vol_label) {
2047 mlog(ML_ERROR, "unable to alloc vol label\n"); 2097 mlog(ML_ERROR, "unable to alloc vol label\n");
@@ -2224,9 +2274,11 @@ static int ocfs2_initialize_super(struct super_block *sb,
2224 } 2274 }
2225 2275
2226 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; 2276 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
2277 osb->osb_clusters_at_boot = OCFS2_I(inode)->ip_clusters;
2227 iput(inode); 2278 iput(inode);
2228 2279
2229 -	osb->bitmap_cpg = ocfs2_group_bitmap_size(sb) * 8;
2280 +	osb->bitmap_cpg = ocfs2_group_bitmap_size(sb, 0,
2281 +						  osb->s_feature_incompat) * 8;
2230 2282
2231 status = ocfs2_init_slot_info(osb); 2283 status = ocfs2_init_slot_info(osb);
2232 if (status < 0) { 2284 if (status < 0) {
@@ -2509,5 +2561,25 @@ void __ocfs2_abort(struct super_block* sb,
2509 ocfs2_handle_error(sb); 2561 ocfs2_handle_error(sb);
2510} 2562}
2511 2563
2564/*
2565 * Void signal blockers, because in-kernel sigprocmask() only fails
2566 * when SIG_* is wrong.
2567 */
2568void ocfs2_block_signals(sigset_t *oldset)
2569{
2570 int rc;
2571 sigset_t blocked;
2572
2573 sigfillset(&blocked);
2574 rc = sigprocmask(SIG_BLOCK, &blocked, oldset);
2575 BUG_ON(rc);
2576}
2577
2578void ocfs2_unblock_signals(sigset_t *oldset)
2579{
2580 int rc = sigprocmask(SIG_SETMASK, oldset, NULL);
2581 BUG_ON(rc);
2582}
2583
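
A minimal usage sketch of the new pair (the critical section is illustrative; the real callers are elsewhere in this series):

	sigset_t oldset;

	ocfs2_block_signals(&oldset);
	/* ... work that must not be interrupted or restarted by a
	 *     signal, e.g. a cluster-locked, journaled update ... */
	ocfs2_unblock_signals(&oldset);
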
2512module_init(ocfs2_init); 2584module_init(ocfs2_init);
2513module_exit(ocfs2_exit); 2585module_exit(ocfs2_exit);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 783f5270f2a1..40c7de084c10 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -45,4 +45,11 @@ void __ocfs2_abort(struct super_block *sb,
45 45
46#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) 46#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
47 47
48/*
49 * Void signal blockers, because in-kernel sigprocmask() only fails
50 * when SIG_* is wrong.
51 */
52void ocfs2_block_signals(sigset_t *oldset);
53void ocfs2_unblock_signals(sigset_t *oldset);
54
48#endif /* OCFS2_SUPER_H */ 55#endif /* OCFS2_SUPER_H */
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 40e53702948c..bfe7190cdbf1 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30 29
31#define MLOG_MASK_PREFIX ML_INODE 30#define MLOG_MASK_PREFIX ML_INODE
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d1b0d386f6d1..98ee6c44102d 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -79,6 +79,7 @@ struct ocfs2_xattr_set_ctxt {
79 struct ocfs2_alloc_context *meta_ac; 79 struct ocfs2_alloc_context *meta_ac;
80 struct ocfs2_alloc_context *data_ac; 80 struct ocfs2_alloc_context *data_ac;
81 struct ocfs2_cached_dealloc_ctxt dealloc; 81 struct ocfs2_cached_dealloc_ctxt dealloc;
82 int set_abort;
82}; 83};
83 84
84#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root)) 85#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root))
@@ -739,11 +740,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
739 goto leave; 740 goto leave;
740 } 741 }
741 742
742 -	status = ocfs2_journal_dirty(handle, vb->vb_bh);
743 -	if (status < 0) {
744 -		mlog_errno(status);
745 -		goto leave;
746 -	}
743 +	ocfs2_journal_dirty(handle, vb->vb_bh);
747 744
748 clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters; 745 clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters;
749 746
@@ -786,12 +783,7 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
786 } 783 }
787 784
788 le32_add_cpu(&vb->vb_xv->xr_clusters, -len); 785 le32_add_cpu(&vb->vb_xv->xr_clusters, -len);
789 -
790 -	ret = ocfs2_journal_dirty(handle, vb->vb_bh);
791 -	if (ret) {
792 -		mlog_errno(ret);
793 -		goto out;
794 -	}
786 +	ocfs2_journal_dirty(handle, vb->vb_bh);
795 787
796 if (ext_flags & OCFS2_EXT_REFCOUNTED) 788 if (ext_flags & OCFS2_EXT_REFCOUNTED)
797 ret = ocfs2_decrease_refcount(inode, handle, 789 ret = ocfs2_decrease_refcount(inode, handle,
@@ -1374,11 +1366,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
1374 memset(bh->b_data + cp_len, 0, 1366 memset(bh->b_data + cp_len, 0,
1375 blocksize - cp_len); 1367 blocksize - cp_len);
1376 1368
1377 -			ret = ocfs2_journal_dirty(handle, bh);
1378 -			if (ret < 0) {
1379 -				mlog_errno(ret);
1380 -				goto out;
1381 -			}
1369 +			ocfs2_journal_dirty(handle, bh);
1382 brelse(bh); 1370 brelse(bh);
1383 bh = NULL; 1371 bh = NULL;
1384 1372
@@ -1622,7 +1610,7 @@ static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc)
1622 /* Now tell xh->xh_entries about it */ 1610 /* Now tell xh->xh_entries about it */
1623 for (i = 0; i < count; i++) { 1611 for (i = 0; i < count; i++) {
1624 offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset); 1612 offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
1625 -		if (offset < namevalue_offset)
1613 +		if (offset <= namevalue_offset)
1626 le16_add_cpu(&xh->xh_entries[i].xe_name_offset, 1614 le16_add_cpu(&xh->xh_entries[i].xe_name_offset,
1627 namevalue_size); 1615 namevalue_size);
1628 } 1616 }
@@ -2148,15 +2136,19 @@ alloc_value:
2148 orig_clusters = ocfs2_xa_value_clusters(loc); 2136 orig_clusters = ocfs2_xa_value_clusters(loc);
2149 rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt); 2137 rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt);
2150 if (rc < 0) { 2138 if (rc < 0) {
2151 -			/*
2152 -			 * If we tried to grow an existing external value,
2153 -			 * ocfs2_xa_cleanup_value_truncate() is going to
2154 -			 * let it stand. We have to restore its original
2155 -			 * value size.
2156 -			 */
2157 -			loc->xl_entry->xe_value_size = orig_value_size;
2139 +			ctxt->set_abort = 1;
2158 ocfs2_xa_cleanup_value_truncate(loc, "growing", 2140 ocfs2_xa_cleanup_value_truncate(loc, "growing",
2159 orig_clusters); 2141 orig_clusters);
2142 /*
2143 * If we were growing an existing value,
2144 * ocfs2_xa_cleanup_value_truncate() won't remove
2145 * the entry. We need to restore the original value
2146 * size.
2147 */
2148 if (loc->xl_entry) {
2149 BUG_ON(!orig_value_size);
2150 loc->xl_entry->xe_value_size = orig_value_size;
2151 }
2160 mlog_errno(rc); 2152 mlog_errno(rc);
2161 } 2153 }
2162 } 2154 }
@@ -2479,7 +2471,10 @@ static int ocfs2_xattr_free_block(struct inode *inode,
2479 xb = (struct ocfs2_xattr_block *)blk_bh->b_data; 2471 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
2480 blk = le64_to_cpu(xb->xb_blkno); 2472 blk = le64_to_cpu(xb->xb_blkno);
2481 bit = le16_to_cpu(xb->xb_suballoc_bit); 2473 bit = le16_to_cpu(xb->xb_suballoc_bit);
2482 -	bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2474 +	if (xb->xb_suballoc_loc)
2475 bg_blkno = le64_to_cpu(xb->xb_suballoc_loc);
2476 else
2477 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2483 2478
2484 xb_alloc_inode = ocfs2_get_system_file_inode(osb, 2479 xb_alloc_inode = ocfs2_get_system_file_inode(osb,
2485 EXTENT_ALLOC_SYSTEM_INODE, 2480 EXTENT_ALLOC_SYSTEM_INODE,
@@ -2594,9 +2589,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
2594 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 2589 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
2595 spin_unlock(&oi->ip_lock); 2590 spin_unlock(&oi->ip_lock);
2596 2591
2597 -	ret = ocfs2_journal_dirty(handle, di_bh);
2598 -	if (ret < 0)
2599 -		mlog_errno(ret);
2592 +	ocfs2_journal_dirty(handle, di_bh);
2600out_commit: 2593out_commit:
2601 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 2594 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
2602out: 2595out:
@@ -2724,9 +2717,7 @@ static int ocfs2_xattr_ibody_init(struct inode *inode,
2724 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 2717 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
2725 spin_unlock(&oi->ip_lock); 2718 spin_unlock(&oi->ip_lock);
2726 2719
2727 -	ret = ocfs2_journal_dirty(ctxt->handle, di_bh);
2728 -	if (ret < 0)
2729 -		mlog_errno(ret);
2720 +	ocfs2_journal_dirty(ctxt->handle, di_bh);
2730 2721
2731out: 2722out:
2732 return ret; 2723 return ret;
@@ -2846,9 +2837,8 @@ static int ocfs2_create_xattr_block(struct inode *inode,
2846 int ret; 2837 int ret;
2847 u16 suballoc_bit_start; 2838 u16 suballoc_bit_start;
2848 u32 num_got; 2839 u32 num_got;
2849 -	u64 first_blkno;
2840 +	u64 suballoc_loc, first_blkno;
2850 struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data; 2841 struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data;
2851 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2852 struct buffer_head *new_bh = NULL; 2842 struct buffer_head *new_bh = NULL;
2853 struct ocfs2_xattr_block *xblk; 2843 struct ocfs2_xattr_block *xblk;
2854 2844
@@ -2859,9 +2849,9 @@ static int ocfs2_create_xattr_block(struct inode *inode,
2859 goto end; 2849 goto end;
2860 } 2850 }
2861 2851
2862 -	ret = ocfs2_claim_metadata(osb, ctxt->handle, ctxt->meta_ac, 1,
2863 -				   &suballoc_bit_start, &num_got,
2864 -				   &first_blkno);
2852 +	ret = ocfs2_claim_metadata(ctxt->handle, ctxt->meta_ac, 1,
2853 +				   &suballoc_loc, &suballoc_bit_start,
2854 +				   &num_got, &first_blkno);
2865 if (ret < 0) { 2855 if (ret < 0) {
2866 mlog_errno(ret); 2856 mlog_errno(ret);
2867 goto end; 2857 goto end;
@@ -2883,8 +2873,10 @@ static int ocfs2_create_xattr_block(struct inode *inode,
2883 memset(xblk, 0, inode->i_sb->s_blocksize); 2873 memset(xblk, 0, inode->i_sb->s_blocksize);
2884 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); 2874 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
2885 xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot); 2875 xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot);
2876 xblk->xb_suballoc_loc = cpu_to_le64(suballoc_loc);
2886 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); 2877 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
2887 -	xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
2878 +	xblk->xb_fs_generation =
2879 +		cpu_to_le32(OCFS2_SB(inode->i_sb)->fs_generation);
2888 xblk->xb_blkno = cpu_to_le64(first_blkno); 2880 xblk->xb_blkno = cpu_to_le64(first_blkno);
2889 if (indexed) { 2881 if (indexed) {
2890 struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root; 2882 struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root;
@@ -2956,7 +2948,7 @@ static int ocfs2_xattr_block_set(struct inode *inode,
2956 ret = ocfs2_xa_set(&loc, xi, ctxt); 2948 ret = ocfs2_xa_set(&loc, xi, ctxt);
2957 if (!ret) 2949 if (!ret)
2958 xs->here = loc.xl_entry; 2950 xs->here = loc.xl_entry;
2959 else if (ret != -ENOSPC) 2951 else if ((ret != -ENOSPC) || ctxt->set_abort)
2960 goto end; 2952 goto end;
2961 else { 2953 else {
2962 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt); 2954 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
@@ -3312,14 +3304,13 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
3312 goto out; 3304 goto out;
3313 } 3305 }
3314 3306
3315 ret = ocfs2_extend_trans(ctxt->handle, credits + 3307 ret = ocfs2_extend_trans(ctxt->handle, credits);
3316 ctxt->handle->h_buffer_credits);
3317 if (ret) { 3308 if (ret) {
3318 mlog_errno(ret); 3309 mlog_errno(ret);
3319 goto out; 3310 goto out;
3320 } 3311 }
3321 ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt); 3312 ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
3322 } else if (ret == -ENOSPC) { 3313 } else if ((ret == -ENOSPC) && !ctxt->set_abort) {
3323 if (di->i_xattr_loc && !xbs->xattr_bh) { 3314 if (di->i_xattr_loc && !xbs->xattr_bh) {
3324 ret = ocfs2_xattr_block_find(inode, 3315 ret = ocfs2_xattr_block_find(inode,
3325 xi->xi_name_index, 3316 xi->xi_name_index,
@@ -3343,8 +3334,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
3343 goto out; 3334 goto out;
3344 } 3335 }
3345 3336
3346 ret = ocfs2_extend_trans(ctxt->handle, credits + 3337 ret = ocfs2_extend_trans(ctxt->handle, credits);
3347 ctxt->handle->h_buffer_credits);
3348 if (ret) { 3338 if (ret) {
3349 mlog_errno(ret); 3339 mlog_errno(ret);
3350 goto out; 3340 goto out;
@@ -3378,8 +3368,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
3378 goto out; 3368 goto out;
3379 } 3369 }
3380 3370
3381 ret = ocfs2_extend_trans(ctxt->handle, credits + 3371 ret = ocfs2_extend_trans(ctxt->handle, credits);
3382 ctxt->handle->h_buffer_credits);
3383 if (ret) { 3372 if (ret) {
3384 mlog_errno(ret); 3373 mlog_errno(ret);
3385 goto out; 3374 goto out;
@@ -4249,7 +4238,6 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
4249 u32 bit_off, len; 4238 u32 bit_off, len;
4250 u64 blkno; 4239 u64 blkno;
4251 handle_t *handle = ctxt->handle; 4240 handle_t *handle = ctxt->handle;
4252 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4253 struct ocfs2_inode_info *oi = OCFS2_I(inode); 4241 struct ocfs2_inode_info *oi = OCFS2_I(inode);
4254 struct buffer_head *xb_bh = xs->xattr_bh; 4242 struct buffer_head *xb_bh = xs->xattr_bh;
4255 struct ocfs2_xattr_block *xb = 4243 struct ocfs2_xattr_block *xb =
@@ -4277,7 +4265,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
4277 goto out; 4265 goto out;
4278 } 4266 }
4279 4267
4280 ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 4268 ret = __ocfs2_claim_clusters(handle, ctxt->data_ac,
4281 1, 1, &bit_off, &len); 4269 1, 1, &bit_off, &len);
4282 if (ret) { 4270 if (ret) {
4283 mlog_errno(ret); 4271 mlog_errno(ret);
@@ -4887,8 +4875,7 @@ static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
4887 * We need to update the first bucket of the old extent and all 4875 * We need to update the first bucket of the old extent and all
4888 * the buckets going to the new extent. 4876 * the buckets going to the new extent.
4889 */ 4877 */
4890 credits = ((num_buckets + 1) * blks_per_bucket) + 4878 credits = ((num_buckets + 1) * blks_per_bucket);
4891 handle->h_buffer_credits;
4892 ret = ocfs2_extend_trans(handle, credits); 4879 ret = ocfs2_extend_trans(handle, credits);
4893 if (ret) { 4880 if (ret) {
4894 mlog_errno(ret); 4881 mlog_errno(ret);
@@ -4958,7 +4945,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
4958 u32 *first_hash) 4945 u32 *first_hash)
4959{ 4946{
4960 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); 4947 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
4961 int ret, credits = 2 * blk_per_bucket + handle->h_buffer_credits; 4948 int ret, credits = 2 * blk_per_bucket;
4962 4949
4963 BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize); 4950 BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
4964 4951
@@ -5099,7 +5086,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
5099 goto leave; 5086 goto leave;
5100 } 5087 }
5101 5088
5102 ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 1, 5089 ret = __ocfs2_claim_clusters(handle, ctxt->data_ac, 1,
5103 clusters_to_add, &bit_off, &num_bits); 5090 clusters_to_add, &bit_off, &num_bits);
5104 if (ret < 0) { 5091 if (ret < 0) {
5105 if (ret != -ENOSPC) 5092 if (ret != -ENOSPC)
@@ -5153,9 +5140,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
5153 goto leave; 5140 goto leave;
5154 } 5141 }
5155 5142
5156 ret = ocfs2_journal_dirty(handle, root_bh); 5143 ocfs2_journal_dirty(handle, root_bh);
5157 if (ret < 0)
5158 mlog_errno(ret);
5159 5144
5160leave: 5145leave:
5161 return ret; 5146 return ret;
@@ -5200,8 +5185,7 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
5200 * existing bucket. Then we add the last existing bucket, the 5185 * existing bucket. Then we add the last existing bucket, the
5201 * new bucket, and the first bucket (3 * blk_per_bucket). 5186 * new bucket, and the first bucket (3 * blk_per_bucket).
5202 */ 5187 */
5203 credits = (end_blk - target_blk) + (3 * blk_per_bucket) + 5188 credits = (end_blk - target_blk) + (3 * blk_per_bucket);
5204 handle->h_buffer_credits;
5205 ret = ocfs2_extend_trans(handle, credits); 5189 ret = ocfs2_extend_trans(handle, credits);
5206 if (ret) { 5190 if (ret) {
5207 mlog_errno(ret); 5191 mlog_errno(ret);
@@ -5477,12 +5461,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
5477 } 5461 }
5478 5462
5479 le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len); 5463 le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len);
5480 5464 ocfs2_journal_dirty(handle, root_bh);
5481 ret = ocfs2_journal_dirty(handle, root_bh);
5482 if (ret) {
5483 mlog_errno(ret);
5484 goto out_commit;
5485 }
5486 5465
5487 ret = ocfs2_truncate_log_append(osb, handle, blkno, len); 5466 ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
5488 if (ret) 5467 if (ret)
@@ -6528,13 +6507,11 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
6528 int indexed) 6507 int indexed)
6529{ 6508{
6530 int ret; 6509 int ret;
6531 struct ocfs2_alloc_context *meta_ac;
6532 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 6510 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6533 struct ocfs2_xattr_set_ctxt ctxt = { 6511 struct ocfs2_xattr_set_ctxt ctxt;
6534 .meta_ac = meta_ac,
6535 };
6536 6512
6537 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); 6513 memset(&ctxt, 0, sizeof(ctxt));
6514 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &ctxt.meta_ac);
6538 if (ret < 0) { 6515 if (ret < 0) {
6539 mlog_errno(ret); 6516 mlog_errno(ret);
6540 return ret; 6517 return ret;
@@ -6556,7 +6533,7 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
6556 6533
6557 ocfs2_commit_trans(osb, ctxt.handle); 6534 ocfs2_commit_trans(osb, ctxt.handle);
6558out: 6535out:
6559 ocfs2_free_alloc_context(meta_ac); 6536 ocfs2_free_alloc_context(ctxt.meta_ac);
6560 return ret; 6537 return ret;
6561} 6538}
6562 6539
@@ -6937,7 +6914,7 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
6937 goto out; 6914 goto out;
6938 } 6915 }
6939 6916
6940 ret = ocfs2_claim_clusters(osb, handle, data_ac, 6917 ret = ocfs2_claim_clusters(handle, data_ac,
6941 len, &p_cluster, &num_clusters); 6918 len, &p_cluster, &num_clusters);
6942 if (ret) { 6919 if (ret) {
6943 mlog_errno(ret); 6920 mlog_errno(ret);
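
Three API shifts account for most of the xattr.c churn above: ocfs2_journal_dirty() now returns void and handles journal failure internally, ocfs2_extend_trans() now adds the handle's existing h_buffer_credits itself so callers pass only the extra credits they need, and the ocfs2_claim_*() allocators take the transaction handle directly (the superblock is derived from it) while also reporting which suballocator group (suballoc_loc) new metadata came from. A minimal sketch of the resulting calling convention; do_xattr_update() is a hypothetical caller, not a function from this commit:

	static int do_xattr_update(handle_t *handle, struct buffer_head *bh,
				   int extra_credits)
	{
		int ret;

		/* callers now pass only the additional credits ... */
		ret = ocfs2_extend_trans(handle, extra_credits);
		if (ret) {
			mlog_errno(ret);
			return ret;
		}
		/* ... and dirtying a journalled buffer can no longer fail
		 * from the caller's point of view */
		ocfs2_journal_dirty(handle, bh);
		return 0;
	}
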
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 75d9b5ba1d45..b44bb835e8ea 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -3,9 +3,9 @@
  * Copyright (C) 2006 Bob Copeland <me@bobcopeland.com>
  * Released under GPL v2.
  */
-#include <linux/version.h>
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/parser.h>
diff --git a/fs/open.c b/fs/open.c
index e17f54454b50..74e5cd9f718e 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -10,7 +10,6 @@
 #include <linux/fdtable.h>
 #include <linux/fsnotify.h>
 #include <linux/module.h>
-#include <linux/slab.h>
 #include <linux/tty.h>
 #include <linux/namei.h>
 #include <linux/backing-dev.h>
@@ -20,6 +19,7 @@
 #include <linux/mount.h>
 #include <linux/vfs.h>
 #include <linux/fcntl.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <linux/fs.h>
 #include <linux/personality.h>
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index e8865c11777f..e238ab23a9e7 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -16,6 +16,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/kmod.h>
 #include <linux/ctype.h>
 #include <linux/genhd.h>
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 49cfd5f54238..91babdae7587 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -95,6 +95,7 @@
  ************************************************************/
 #include <linux/crc32.h>
 #include <linux/math64.h>
+#include <linux/slab.h>
 #include "check.h"
 #include "efi.h"
 
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 0028d2ef0662..90be97f1f5a8 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -31,14 +31,17 @@
  */
 #include <asm/unaligned.h>
 
-#define SYS_IND(p)	(get_unaligned(&p->sys_ind))
-#define NR_SECTS(p)	({ __le32 __a = get_unaligned(&p->nr_sects);	\
-				le32_to_cpu(__a); \
-			})
+#define SYS_IND(p)	get_unaligned(&p->sys_ind)
 
-#define START_SECT(p)	({ __le32 __a = get_unaligned(&p->start_sect);	\
-				le32_to_cpu(__a); \
-			})
+static inline sector_t nr_sects(struct partition *p)
+{
+	return (sector_t)get_unaligned_le32(&p->nr_sects);
+}
+
+static inline sector_t start_sect(struct partition *p)
+{
+	return (sector_t)get_unaligned_le32(&p->start_sect);
+}
 
 static inline int is_extended_partition(struct partition *p)
 {
@@ -104,13 +107,13 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
 
 static void
 parse_extended(struct parsed_partitions *state, struct block_device *bdev,
-	       u32 first_sector, u32 first_size)
+	       sector_t first_sector, sector_t first_size)
 {
 	struct partition *p;
 	Sector sect;
 	unsigned char *data;
-	u32 this_sector, this_size;
-	int sector_size = bdev_logical_block_size(bdev) / 512;
+	sector_t this_sector, this_size;
+	sector_t sector_size = bdev_logical_block_size(bdev) / 512;
 	int loopct = 0;		/* number of links followed
 				   without finding a data partition */
 	int i;
@@ -145,14 +148,14 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
 		 * First process the data partition(s)
 		 */
 		for (i=0; i<4; i++, p++) {
-			u32 offs, size, next;
-			if (!NR_SECTS(p) || is_extended_partition(p))
+			sector_t offs, size, next;
+			if (!nr_sects(p) || is_extended_partition(p))
 				continue;
 
 			/* Check the 3rd and 4th entries -
			   these sometimes contain random garbage */
-			offs = START_SECT(p)*sector_size;
-			size = NR_SECTS(p)*sector_size;
+			offs = start_sect(p)*sector_size;
+			size = nr_sects(p)*sector_size;
 			next = this_sector + offs;
 			if (i >= 2) {
 				if (offs + size > this_size)
@@ -179,13 +182,13 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
 		 */
 		p -= 4;
 		for (i=0; i<4; i++, p++)
-			if (NR_SECTS(p) && is_extended_partition(p))
+			if (nr_sects(p) && is_extended_partition(p))
 				break;
 		if (i == 4)
 			goto done;	/* nothing left to do */
 
-		this_sector = first_sector + START_SECT(p) * sector_size;
-		this_size = NR_SECTS(p) * sector_size;
+		this_sector = first_sector + start_sect(p) * sector_size;
+		this_size = nr_sects(p) * sector_size;
 		put_dev_sector(sect);
 	}
 done:
@@ -197,7 +200,7 @@ done:
 
 static void
 parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
-		  u32 offset, u32 size, int origin)
+		  sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_SOLARIS_X86_PARTITION
 	Sector sect;
@@ -244,7 +247,7 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
  */
 static void
 parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
-	  u32 offset, u32 size, int origin, char *flavour,
+	  sector_t offset, sector_t size, int origin, char *flavour,
 	  int max_partitions)
 {
 	Sector sect;
@@ -263,7 +266,7 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
 	if (le16_to_cpu(l->d_npartitions) < max_partitions)
 		max_partitions = le16_to_cpu(l->d_npartitions);
 	for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) {
-		u32 bsd_start, bsd_size;
+		sector_t bsd_start, bsd_size;
 
 		if (state->next == state->limit)
 			break;
@@ -290,7 +293,7 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
 
 static void
 parse_freebsd(struct parsed_partitions *state, struct block_device *bdev,
-	      u32 offset, u32 size, int origin)
+	      sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_BSD_DISKLABEL
 	parse_bsd(state, bdev, offset, size, origin,
@@ -300,7 +303,7 @@ parse_freebsd(struct parsed_partitions *state, struct block_device *bdev,
 
 static void
 parse_netbsd(struct parsed_partitions *state, struct block_device *bdev,
-	     u32 offset, u32 size, int origin)
+	     sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_BSD_DISKLABEL
 	parse_bsd(state, bdev, offset, size, origin,
@@ -310,7 +313,7 @@ parse_netbsd(struct parsed_partitions *state, struct block_device *bdev,
 
 static void
 parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
-	      u32 offset, u32 size, int origin)
+	      sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_BSD_DISKLABEL
 	parse_bsd(state, bdev, offset, size, origin,
@@ -324,7 +327,7 @@ parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
  */
 static void
 parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
-	       u32 offset, u32 size, int origin)
+	       sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_UNIXWARE_DISKLABEL
 	Sector sect;
@@ -348,7 +351,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
 
 		if (p->s_label != UNIXWARE_FS_UNUSED)
 			put_partition(state, state->next++,
-				      START_SECT(p), NR_SECTS(p));
+				      le32_to_cpu(p->start_sect),
+				      le32_to_cpu(p->nr_sects));
 		p++;
 	}
 	put_dev_sector(sect);
@@ -363,7 +367,7 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
  */
 static void
 parse_minix(struct parsed_partitions *state, struct block_device *bdev,
-	    u32 offset, u32 size, int origin)
+	    sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_MINIX_SUBPARTITION
 	Sector sect;
@@ -390,7 +394,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
 			/* add each partition in use */
 			if (SYS_IND(p) == MINIX_PARTITION)
 				put_partition(state, state->next++,
-					      START_SECT(p), NR_SECTS(p));
+					      start_sect(p), nr_sects(p));
 		}
 		printk(" >\n");
 	}
@@ -401,7 +405,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
 static struct {
 	unsigned char id;
 	void (*parse)(struct parsed_partitions *, struct block_device *,
-			u32, u32, int);
+			sector_t, sector_t, int);
 } subtypes[] = {
 	{FREEBSD_PARTITION, parse_freebsd},
 	{NETBSD_PARTITION, parse_netbsd},
@@ -415,7 +419,7 @@ static struct {
 
 int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 {
-	int sector_size = bdev_logical_block_size(bdev) / 512;
+	sector_t sector_size = bdev_logical_block_size(bdev) / 512;
 	Sector sect;
 	unsigned char *data;
 	struct partition *p;
@@ -483,14 +487,21 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 
 	state->next = 5;
 	for (slot = 1 ; slot <= 4 ; slot++, p++) {
-		u32 start = START_SECT(p)*sector_size;
-		u32 size = NR_SECTS(p)*sector_size;
+		sector_t start = start_sect(p)*sector_size;
+		sector_t size = nr_sects(p)*sector_size;
 		if (!size)
 			continue;
 		if (is_extended_partition(p)) {
-			/* prevent someone doing mkfs or mkswap on an
-			   extended partition, but leave room for LILO */
-			put_partition(state, slot, start, size == 1 ? 1 : 2);
+			/*
+			 * prevent someone doing mkfs or mkswap on an
+			 * extended partition, but leave room for LILO
+			 * FIXME: this uses one logical sector for > 512b
+			 * sector, although it may not be enough/proper.
+			 */
+			sector_t n = 2;
+			n = min(size, max(sector_size, n));
+			put_partition(state, slot, start, n);
+
 			printk(" <");
 			parse_extended(state, bdev, start, size);
 			printk(" >");
@@ -513,7 +524,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 		unsigned char id = SYS_IND(p);
 		int n;
 
-		if (!NR_SECTS(p))
+		if (!nr_sects(p))
 			continue;
 
 		for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++)
@@ -521,8 +532,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 
 		if (!subtypes[n].parse)
 			continue;
-		subtypes[n].parse(state, bdev, START_SECT(p)*sector_size,
-				  NR_SECTS(p)*sector_size, slot);
+		subtypes[n].parse(state, bdev, start_sect(p)*sector_size,
+				  nr_sects(p)*sector_size, slot);
 	}
 	put_dev_sector(sect);
 	return 1;
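
The msdos.c conversion above has one theme: all partition offset and size arithmetic moves from u32 to sector_t (64-bit when CONFIG_LBDAF is set), with the statement-expression macros replaced by typed inline helpers built on get_unaligned_le32(). The point is that products such as start_sect(p) * sector_size now widen before multiplying instead of wrapping at 2^32 sectors. A standalone illustration of the widening, in plain C and assuming a 64-bit sector_t:

	#include <stdint.h>

	typedef uint64_t sector_t;	/* mirrors sector_t with CONFIG_LBDAF=y */

	/* u32 * u32 silently wraps; widening one operand first keeps
	 * the full 64-bit result */
	static sector_t scaled_start(uint32_t start_sect, uint32_t sector_size)
	{
		return (sector_t)start_sect * sector_size;
	}
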
diff --git a/fs/proc/array.c b/fs/proc/array.c
index aa8637b81028..885ab5513ac5 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -68,7 +68,6 @@
 #include <linux/hugetlb.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
-#include <linux/slab.h>
 #include <linux/smp.h>
 #include <linux/signal.h>
 #include <linux/highmem.h>
@@ -82,7 +81,6 @@
 #include <linux/pid_namespace.h>
 #include <linux/ptrace.h>
 #include <linux/tracehook.h>
-#include <linux/swapops.h>
 
 #include <asm/pgtable.h>
 #include <asm/processor.h>
@@ -496,7 +494,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		rsslim,
 		mm ? mm->start_code : 0,
 		mm ? mm->end_code : 0,
-		(permitted && mm) ? task->stack_start : 0,
+		(permitted && mm) ? mm->start_stack : 0,
 		esp,
 		eip,
 		/* The signal information here is obsolete.
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a7310841c831..c7f9f23449dc 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -81,6 +81,7 @@
 #include <linux/elf.h>
 #include <linux/pid_namespace.h>
 #include <linux/fs_struct.h>
+#include <linux/slab.h>
 #include "internal.h"
 
 /* NOTE:
@@ -442,12 +443,13 @@ static const struct file_operations proc_lstats_operations = {
 unsigned long badness(struct task_struct *p, unsigned long uptime);
 static int proc_oom_score(struct task_struct *task, char *buffer)
 {
-	unsigned long points;
+	unsigned long points = 0;
 	struct timespec uptime;
 
 	do_posix_clock_monotonic_gettime(&uptime);
 	read_lock(&tasklist_lock);
-	points = badness(task->group_leader, uptime.tv_sec);
+	if (pid_alive(task))
+		points = badness(task, uptime.tv_sec);
 	read_unlock(&tasklist_lock);
 	return sprintf(buffer, "%lu\n", points);
 }
@@ -728,6 +730,7 @@ out_no_task:
 
 static const struct file_operations proc_info_file_operations = {
 	.read		= proc_info_read,
+	.llseek		= generic_file_llseek,
 };
 
 static int proc_single_show(struct seq_file *m, void *v)
@@ -985,6 +988,7 @@ out_no_task:
 
 static const struct file_operations proc_environ_operations = {
 	.read		= environ_read,
+	.llseek		= generic_file_llseek,
 };
 
 static ssize_t oom_adjust_read(struct file *file, char __user *buf,
@@ -1058,6 +1062,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 static const struct file_operations proc_oom_adjust_operations = {
 	.read		= oom_adjust_read,
 	.write		= oom_adjust_write,
+	.llseek		= generic_file_llseek,
 };
 
 #ifdef CONFIG_AUDITSYSCALL
@@ -1129,6 +1134,7 @@ out_free_page:
 static const struct file_operations proc_loginuid_operations = {
 	.read		= proc_loginuid_read,
 	.write		= proc_loginuid_write,
+	.llseek		= generic_file_llseek,
 };
 
 static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
@@ -1149,6 +1155,7 @@ static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
 
 static const struct file_operations proc_sessionid_operations = {
 	.read		= proc_sessionid_read,
+	.llseek		= generic_file_llseek,
 };
 #endif
 
@@ -1200,6 +1207,7 @@ static ssize_t proc_fault_inject_write(struct file * file,
 static const struct file_operations proc_fault_inject_operations = {
 	.read		= proc_fault_inject_read,
 	.write		= proc_fault_inject_write,
+	.llseek		= generic_file_llseek,
 };
 #endif
 
@@ -1941,7 +1949,7 @@ static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
 }
 
 static const struct file_operations proc_fdinfo_file_operations = {
-	.open           = nonseekable_open,
+	.open		= nonseekable_open,
 	.read		= proc_fdinfo_read,
 };
 
@@ -2225,6 +2233,7 @@ out_no_task:
 static const struct file_operations proc_pid_attr_operations = {
 	.read		= proc_pid_attr_read,
 	.write		= proc_pid_attr_write,
+	.llseek		= generic_file_llseek,
 };
 
 static const struct pid_entry attr_dir_stuff[] = {
@@ -2345,6 +2354,7 @@ static ssize_t proc_coredump_filter_write(struct file *file,
 static const struct file_operations proc_coredump_filter_operations = {
 	.read		= proc_coredump_filter_read,
 	.write		= proc_coredump_filter_write,
+	.llseek		= generic_file_llseek,
 };
 #endif
 
@@ -2907,7 +2917,7 @@ out_no_task:
  */
 static const struct pid_entry tid_base_stuff[] = {
 	DIR("fd",        S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
-	DIR("fdinfo",    S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fd_operations),
+	DIR("fdinfo",    S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
 	REG("environ",   S_IRUSR, proc_environ_operations),
 	INF("auxv",      S_IRUSR, proc_pid_auxv),
 	ONE("status",    S_IRUGO, proc_pid_status),
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 08f4d71dacd7..43c127490606 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -13,6 +13,7 @@
 #include <linux/proc_fs.h>
 #include <linux/stat.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/init.h>
 #include <linux/idr.h>
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 445a02bcaab3..aea8502e58a3 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -18,6 +18,7 @@
 #include <linux/module.h>
 #include <linux/smp_lock.h>
 #include <linux/sysctl.h>
+#include <linux/slab.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -231,9 +232,9 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne
 		if (rv == -ENOIOCTLCMD)
 			rv = -EINVAL;
 	} else if (ioctl) {
-		lock_kernel();
+		WARN_ONCE(1, "Procfs ioctl handlers must use unlocked_ioctl, "
+			  "%pf will be called without the Bkl held\n", ioctl);
 		rv = ioctl(file->f_path.dentry->d_inode, file, cmd, arg);
-		unlock_kernel();
 	}
 
 	pde_users_dec(pde);
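
The proc_reg_unlocked_ioctl() change stops wrapping legacy ->ioctl handlers in lock_kernel()/unlock_kernel() and instead emits a one-time warning. Note the %pf printk format, which resolves a function pointer to its symbol name so the log names the offending handler. The same idiom in isolation (the handler name below is made up):

	/* sketch: warn once, identifying the function pointer via %pf */
	WARN_ONCE(1, "legacy handler %pf called without the BKL\n",
		  some_legacy_ioctl);
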
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index a44a7897fd4d..c837a77351be 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -19,6 +19,7 @@
 #include <linux/highmem.h>
 #include <linux/bootmem.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 #include <linux/list.h>
@@ -490,7 +491,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 		}
 		read_unlock(&kclist_lock);
 
-		if (m == NULL) {
+		if (&m->list == &kclist_head) {
 			if (clear_user(buffer, tsz))
 				return -EFAULT;
 		} else if (is_vmalloc_or_module_addr((void *)start)) {
@@ -557,6 +558,7 @@ static int open_kcore(struct inode *inode, struct file *filp)
 static const struct file_operations proc_kcore_operations = {
 	.read		= read_kcore,
 	.open		= open_kcore,
+	.llseek		= generic_file_llseek,
 };
 
 #ifdef CONFIG_MEMORY_HOTPLUG
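
The read_kcore() fix is the classic list_for_each_entry() termination pitfall: when the loop ends without a break, the cursor is not NULL — it is container_of() applied to the list head itself, a bogus pointer — so end-of-list must be detected by comparing the cursor's list node against the head. The idiom, as a loose sketch reusing the kclist names (matches() is a stand-in predicate, not a kernel function):

	struct kcore_list *m;

	list_for_each_entry(m, &kclist_head, list) {
		if (matches(m, start, tsz))
			break;
	}
	if (&m->list == &kclist_head) {
		/* fell off the end: nothing matched, and m is NOT a
		 * valid kcore_list here -- never dereference it */
	}
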
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index cfe90a48a6e8..bd4b5a740ff1 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -53,6 +53,7 @@ static const struct file_operations proc_kmsg_operations = {
 	.poll		= kmsg_poll,
 	.open		= kmsg_open,
 	.release	= kmsg_release,
+	.llseek		= generic_file_llseek,
 };
 
 static int __init proc_kmsg_init(void)
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 9fe7d7ebe115..b1822dde55c2 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -21,7 +21,6 @@
 #include <linux/mmzone.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
-#include <linux/slab.h>
 #include <linux/smp.h>
 #include <linux/seq_file.h>
 #include <linux/hugetlb.h>
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index f8650dce74fb..ce94801f48ca 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -12,6 +12,7 @@
 #include <linux/string.h>
 #include <linux/of.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <asm/prom.h>
 #include <asm/uaccess.h>
 #include "internal.h"
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 04d1270f1c38..9020ac15baaa 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -14,6 +14,7 @@
 #include <linux/time.h>
 #include <linux/proc_fs.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/module.h>
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index b9b7aad2003d..bf31b03fc275 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -1,6 +1,5 @@
 #include <linux/cpumask.h>
 #include <linux/fs.h>
-#include <linux/gfp.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 183f8ff5f400..47f5b145f56e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -4,6 +4,7 @@
 #include <linux/seq_file.h>
 #include <linux/highmem.h>
 #include <linux/ptrace.h>
+#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/mempolicy.h>
 #include <linux/swap.h>
@@ -246,25 +247,6 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 		} else if (vma->vm_start <= mm->start_stack &&
 			   vma->vm_end >= mm->start_stack) {
 			name = "[stack]";
-		} else {
-			unsigned long stack_start;
-			struct proc_maps_private *pmp;
-
-			pmp = m->private;
-			stack_start = pmp->task->stack_start;
-
-			if (vma->vm_start <= stack_start &&
-			    vma->vm_end >= stack_start) {
-				pad_len_spaces(m, len);
-				seq_printf(m,
-					 "[threadstack:%08lx]",
-#ifdef CONFIG_STACK_GROWSUP
-					 vma->vm_end - stack_start
-#else
-					 stack_start - vma->vm_start
-#endif
-					 );
-			}
 		}
 	} else {
 		name = "[vdso]";
@@ -406,6 +388,7 @@ static int show_smap(struct seq_file *m, void *v)
 
 	memset(&mss, 0, sizeof mss);
 	mss.vma = vma;
+	/* mmap_sem is held in m_start */
 	if (vma->vm_mm && !is_vm_hugetlb_page(vma))
 		walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
 
@@ -552,7 +535,8 @@ const struct file_operations proc_clear_refs_operations = {
 };
 
 struct pagemapread {
-	u64 __user *out, *end;
+	int pos, len;
+	u64 *buffer;
 };
 
 #define PM_ENTRY_BYTES      sizeof(u64)
@@ -575,10 +559,8 @@ struct pagemapread {
 static int add_to_pagemap(unsigned long addr, u64 pfn,
 			  struct pagemapread *pm)
 {
-	if (put_user(pfn, pm->out))
-		return -EFAULT;
-	pm->out++;
-	if (pm->out >= pm->end)
+	pm->buffer[pm->pos++] = pfn;
+	if (pm->pos >= pm->len)
 		return PM_END_OF_BUFFER;
 	return 0;
 }
@@ -661,31 +643,18 @@ static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
 	return pme;
 }
 
-static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr,
-				 unsigned long end, struct mm_walk *walk)
+/* This function walks within one hugetlb entry in the single call */
+static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
+				 unsigned long addr, unsigned long end,
+				 struct mm_walk *walk)
 {
-	struct vm_area_struct *vma;
 	struct pagemapread *pm = walk->private;
-	struct hstate *hs = NULL;
 	int err = 0;
+	u64 pfn;
 
-	vma = find_vma(walk->mm, addr);
-	if (vma)
-		hs = hstate_vma(vma);
 	for (; addr != end; addr += PAGE_SIZE) {
-		u64 pfn = PM_NOT_PRESENT;
-
-		if (vma && (addr >= vma->vm_end)) {
-			vma = find_vma(walk->mm, addr);
-			if (vma)
-				hs = hstate_vma(vma);
-		}
-
-		if (vma && (vma->vm_start <= addr) && is_vm_hugetlb_page(vma)) {
-			/* calculate pfn of the "raw" page in the hugepage. */
-			int offset = (addr & ~huge_page_mask(hs)) >> PAGE_SHIFT;
-			pfn = huge_pte_to_pagemap_entry(*pte, offset);
-		}
+		int offset = (addr & ~hmask) >> PAGE_SHIFT;
+		pfn = huge_pte_to_pagemap_entry(*pte, offset);
 		err = add_to_pagemap(addr, pfn, pm);
 		if (err)
 			return err;
@@ -720,21 +689,20 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr,
  * determine which areas of memory are actually mapped and llseek to
 * skip over unmapped regions.
 */
+#define PAGEMAP_WALK_SIZE	(PMD_SIZE)
 static ssize_t pagemap_read(struct file *file, char __user *buf,
 			    size_t count, loff_t *ppos)
 {
 	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
-	struct page **pages, *page;
-	unsigned long uaddr, uend;
 	struct mm_struct *mm;
 	struct pagemapread pm;
-	int pagecount;
 	int ret = -ESRCH;
 	struct mm_walk pagemap_walk = {};
 	unsigned long src;
 	unsigned long svpfn;
 	unsigned long start_vaddr;
 	unsigned long end_vaddr;
+	int copied = 0;
 
 	if (!task)
 		goto out;
@@ -757,35 +725,12 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 	if (!mm)
 		goto out_task;
 
-
-	uaddr = (unsigned long)buf & PAGE_MASK;
-	uend = (unsigned long)(buf + count);
-	pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE;
-	ret = 0;
-	if (pagecount == 0)
-		goto out_mm;
-	pages = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
+	pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
+	pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
 	ret = -ENOMEM;
-	if (!pages)
+	if (!pm.buffer)
 		goto out_mm;
 
-	down_read(&current->mm->mmap_sem);
-	ret = get_user_pages(current, current->mm, uaddr, pagecount,
-			     1, 0, pages, NULL);
-	up_read(&current->mm->mmap_sem);
-
-	if (ret < 0)
-		goto out_free;
-
-	if (ret != pagecount) {
-		pagecount = ret;
-		ret = -EFAULT;
-		goto out_pages;
-	}
-
-	pm.out = (u64 __user *)buf;
-	pm.end = (u64 __user *)(buf + count);
-
 	pagemap_walk.pmd_entry = pagemap_pte_range;
 	pagemap_walk.pte_hole = pagemap_pte_hole;
 	pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
@@ -807,23 +752,36 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 	 * user buffer is tracked in "pm", and the walk
 	 * will stop when we hit the end of the buffer.
 	 */
-	ret = walk_page_range(start_vaddr, end_vaddr, &pagemap_walk);
-	if (ret == PM_END_OF_BUFFER)
-		ret = 0;
-	/* don't need mmap_sem for these, but this looks cleaner */
-	*ppos += (char __user *)pm.out - buf;
-	if (!ret)
-		ret = (char __user *)pm.out - buf;
-
-out_pages:
-	for (; pagecount; pagecount--) {
-		page = pages[pagecount-1];
-		if (!PageReserved(page))
-			SetPageDirty(page);
-		page_cache_release(page);
+	ret = 0;
+	while (count && (start_vaddr < end_vaddr)) {
+		int len;
+		unsigned long end;
+
+		pm.pos = 0;
+		end = start_vaddr + PAGEMAP_WALK_SIZE;
+		/* overflow ? */
+		if (end < start_vaddr || end > end_vaddr)
+			end = end_vaddr;
+		down_read(&mm->mmap_sem);
+		ret = walk_page_range(start_vaddr, end, &pagemap_walk);
+		up_read(&mm->mmap_sem);
+		start_vaddr = end;
+
+		len = min(count, PM_ENTRY_BYTES * pm.pos);
+		if (copy_to_user(buf, pm.buffer, len)) {
+			ret = -EFAULT;
+			goto out_free;
+		}
+		copied += len;
+		buf += len;
+		count -= len;
 	}
+	*ppos += copied;
+	if (!ret || ret == PM_END_OF_BUFFER)
+		ret = copied;
+
 out_free:
-	kfree(pages);
+	kfree(pm.buffer);
 out_mm:
 	mmput(mm);
 out_task:
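
The pagemap_read() rewrite replaces pinning the caller's buffer with get_user_pages() by a small kernel bounce buffer that is filled one PAGEMAP_WALK_SIZE chunk at a time, taking mmap_sem only around each walk and doing copy_to_user() outside the lock. The general shape of that pattern, reduced to a sketch — fill_chunk() and CHUNK_BYTES are placeholders, not symbols from this commit:

	static ssize_t chunked_read(char __user *buf, size_t count)
	{
		u64 *kbuf = kmalloc(CHUNK_BYTES, GFP_KERNEL);
		ssize_t copied = 0;
		size_t len;

		if (!kbuf)
			return -ENOMEM;
		while (count) {
			/* produce at most one buffer-full per iteration */
			len = fill_chunk(kbuf, min_t(size_t, count, CHUNK_BYTES));
			if (!len)
				break;
			/* copy to userspace with no locks held */
			if (copy_to_user(buf, kbuf, len)) {
				copied = -EFAULT;
				break;
			}
			buf += len;
			count -= len;
			copied += len;
		}
		kfree(kbuf);
		return copied;
	}
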
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 5d9fd64ef81a..46d4b5d72bd3 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -5,6 +5,7 @@
 #include <linux/fs_struct.h>
 #include <linux/mount.h>
 #include <linux/ptrace.h>
+#include <linux/slab.h>
 #include <linux/seq_file.h>
 #include "internal.h"
 
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 0872afa58d39..91c817ff02c3 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -12,6 +12,7 @@
 #include <linux/user.h>
 #include <linux/elf.h>
 #include <linux/elfcore.h>
+#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/bootmem.h>
 #include <linux/init.h>
@@ -162,6 +163,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
 
 static const struct file_operations proc_vmcore_operations = {
 	.read		= read_vmcore,
+	.llseek		= generic_file_llseek,
 };
 
 static struct vmcore* __init get_new_element(void)
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index ebf3440d28ca..277575ddc05c 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -201,7 +201,8 @@ static const char *qnx4_checkroot(struct super_block *sb)
 			rootdir = (struct qnx4_inode_entry *) (bh->b_data + i * QNX4_DIR_ENTRY_SIZE);
 			if (rootdir->di_fname != NULL) {
 				QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname));
-				if (!strncmp(rootdir->di_fname, QNX4_BMNAME, sizeof QNX4_BMNAME)) {
+				if (!strcmp(rootdir->di_fname,
+					    QNX4_BMNAME)) {
 					found = 1;
 					qnx4_sb(sb)->BitMap = kmalloc( sizeof( struct qnx4_inode_entry ), GFP_KERNEL );
 					if (!qnx4_sb(sb)->BitMap) {
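
The qnx4 change is mostly cosmetic: QNX4_BMNAME expands to a string literal, and strncmp(s, lit, sizeof lit) compares through the literal's NUL terminator, which is exact-match semantics spelled confusingly; strcmp() says the same thing directly. An illustration in plain C, with a generic literal standing in for QNX4_BMNAME:

	#include <string.h>

	/* sizeof "Bitmap" is 7 and includes '\0', so the strncmp form
	 * already rejects both shorter and longer names ... */
	static int is_bitmap_old(const char *name)
	{
		return !strncmp(name, "Bitmap", sizeof "Bitmap");
	}

	/* ... making it equivalent to, but less clear than, strcmp */
	static int is_bitmap_new(const char *name)
	{
		return !strcmp(name, "Bitmap");
	}
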
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index dad7fb247ddc..3e21b1e2ad3a 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -33,6 +33,14 @@ config PRINT_QUOTA_WARNING
 	  Note that this behavior is currently deprecated and may go away in
 	  future. Please use notification via netlink socket instead.
 
+config QUOTA_DEBUG
+	bool "Additional quota sanity checks"
+	depends on QUOTA
+	default n
+	help
+	  If you say Y here, quota subsystem will perform some additional
+	  sanity checks of quota internal structures. If unsure, say N.
+
 # Generic support for tree structured quota files. Selected when needed.
 config QUOTA_TREE
 	tristate
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index e0b870f4749f..788b5802a7ce 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -80,8 +80,6 @@
 
 #include <asm/uaccess.h>
 
-#define __DQUOT_PARANOIA
-
 /*
 * There are three quota SMP locks. dq_list_lock protects all lists with quotas
 * and quota formats, dqstats structure containing statistics about the lists
@@ -695,7 +693,7 @@ void dqput(struct dquot *dquot)
 
 	if (!dquot)
 		return;
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 	if (!atomic_read(&dquot->dq_count)) {
 		printk("VFS: dqput: trying to free free dquot\n");
 		printk("VFS: device %s, dquot of %s %d\n",
@@ -748,7 +746,7 @@ we_slept:
 		goto we_slept;
 	}
 	atomic_dec(&dquot->dq_count);
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 	/* sanity check */
 	BUG_ON(!list_empty(&dquot->dq_free));
 #endif
@@ -845,7 +843,7 @@ we_slept:
 		dquot = NULL;
 		goto out;
 	}
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 	BUG_ON(!dquot->dq_sb);	/* Has somebody invalidated entry under us? */
 #endif
 out:
@@ -874,14 +872,18 @@ static int dqinit_needed(struct inode *inode, int type)
 static void add_dquot_ref(struct super_block *sb, int type)
 {
 	struct inode *inode, *old_inode = NULL;
+#ifdef CONFIG_QUOTA_DEBUG
 	int reserved = 0;
+#endif
 
 	spin_lock(&inode_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
 			continue;
+#ifdef CONFIG_QUOTA_DEBUG
 		if (unlikely(inode_get_rsv_space(inode) > 0))
 			reserved = 1;
+#endif
 		if (!atomic_read(&inode->i_writecount))
 			continue;
 		if (!dqinit_needed(inode, type))
@@ -903,11 +905,13 @@ static void add_dquot_ref(struct super_block *sb, int type)
 	spin_unlock(&inode_lock);
 	iput(old_inode);
 
+#ifdef CONFIG_QUOTA_DEBUG
 	if (reserved) {
 		printk(KERN_WARNING "VFS (%s): Writes happened before quota"
 			" was turned on thus quota information is probably "
 			"inconsistent. Please run quotacheck(8).\n", sb->s_id);
 	}
+#endif
 }
 
 /*
@@ -934,7 +938,7 @@ static int remove_inode_dquot_ref(struct inode *inode, int type,
 		inode->i_dquot[type] = NULL;
 		if (dquot) {
 			if (dqput_blocks(dquot)) {
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 				if (atomic_read(&dquot->dq_count) != 1)
 					printk(KERN_WARNING "VFS: Adding dquot with dq_count %d to dispose list.\n", atomic_read(&dquot->dq_count));
 #endif
@@ -2322,34 +2326,34 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
 	if (di->dqb_valid & QIF_SPACE) {
 		dm->dqb_curspace = di->dqb_curspace - dm->dqb_rsvspace;
 		check_blim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
 	}
 	if (di->dqb_valid & QIF_BLIMITS) {
 		dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit);
 		dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit);
 		check_blim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
 	}
 	if (di->dqb_valid & QIF_INODES) {
 		dm->dqb_curinodes = di->dqb_curinodes;
 		check_ilim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
 	}
 	if (di->dqb_valid & QIF_ILIMITS) {
 		dm->dqb_isoftlimit = di->dqb_isoftlimit;
 		dm->dqb_ihardlimit = di->dqb_ihardlimit;
 		check_ilim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
 	}
 	if (di->dqb_valid & QIF_BTIME) {
 		dm->dqb_btime = di->dqb_btime;
 		check_blim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
 	}
 	if (di->dqb_valid & QIF_ITIME) {
 		dm->dqb_itime = di->dqb_itime;
 		check_ilim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
 	}
 
 	if (check_blim) {
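
Two independent cleanups run through dquot.c: the private __DQUOT_PARANOIA define becomes a proper CONFIG_QUOTA_DEBUG Kconfig option (with the reserved-space warning compiled out entirely when it is off), and do_set_dqblk() switches from __set_bit() to set_bit(). The second is a correctness point: dq_flags is a shared word, __set_bit() is the non-atomic variant and can lose a racing update to a neighbouring bit, while set_bit() is an atomic read-modify-write. The distinction, in a sketch:

	unsigned long flags = 0;

	__set_bit(0, &flags);	/* non-atomic: only safe while 'flags' is
				 * unshared or externally locked */
	set_bit(1, &flags);	/* atomic RMW: safe against concurrent
				 * bitops on the same word */
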
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
index 2663ed90fb03..d67908b407d9 100644
--- a/fs/quota/netlink.c
+++ b/fs/quota/netlink.c
@@ -5,6 +5,7 @@
 #include <linux/kernel.h>
 #include <linux/quotaops.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <net/netlink.h>
 #include <net/genetlink.h>
 
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 1739a4aba25f..5ea4ad81a429 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -21,6 +21,7 @@
 #include <linux/pagevec.h>
 #include <linux/mman.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 
 #include <asm/uaccess.h>
 #include "internal.h"
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a6090aa1a7c1..f47cd212dee1 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -35,6 +35,7 @@
 #include <linux/sched.h>
 #include <linux/parser.h>
 #include <linux/magic.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -213,7 +214,7 @@ static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts)
 	return 0;
 }
 
-static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
+int ramfs_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct ramfs_fs_info *fsi;
 	struct inode *inode = NULL;
diff --git a/fs/read_write.c b/fs/read_write.c
index b7f4a1f94d48..113386d6fd2d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -258,6 +258,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
 	init_sync_kiocb(&kiocb, filp);
 	kiocb.ki_pos = *ppos;
 	kiocb.ki_left = len;
+	kiocb.ki_nbytes = len;
 
 	for (;;) {
 		ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
@@ -313,6 +314,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
 	init_sync_kiocb(&kiocb, filp);
 	kiocb.ki_pos = *ppos;
 	kiocb.ki_left = len;
+	kiocb.ki_nbytes = len;
 
 	for (;;) {
 		ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
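
The two do_sync_* hunks are defensive: struct kiocb of this era carries both ki_left (bytes remaining) and ki_nbytes (total request size), and some ->aio_read()/->aio_write() implementations consult ki_nbytes, so the synchronous wrappers now initialize both fields rather than leaving ki_nbytes zero. The setup, condensed:

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = *ppos;
	kiocb.ki_left = len;	/* bytes still to transfer */
	kiocb.ki_nbytes = len;	/* total request size: keep the two in sync */
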
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index dc014f7def05..483442e66ed6 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -169,7 +169,7 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
 			return 0;	// No free blocks in this bitmap
 		}
 
-		/* search for a first zero bit -- beggining of a window */
+		/* search for a first zero bit -- beginning of a window */
 		*beg = reiserfs_find_next_zero_le_bit
 		    ((unsigned long *)(bh->b_data), boundary, *beg);
 
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index c094f58c7448..07930449a958 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -8,6 +8,7 @@
 #include <linux/reiserfs_fs.h>
 #include <linux/stat.h>
 #include <linux/buffer_head.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 
 extern const struct reiserfs_key MIN_KEY;
@@ -45,8 +46,6 @@ static inline bool is_privroot_deh(struct dentry *dir,
 				   struct reiserfs_de_head *deh)
 {
 	struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root;
-	if (reiserfs_expose_privroot(dir->d_sb))
-		return 0;
 	return (dir == dir->d_parent && privroot->d_inode &&
 		deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid);
 }
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 6591cb21edf6..1e4250bc3a6f 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -35,6 +35,7 @@
  **/
 
 #include <linux/time.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/reiserfs_fs.h>
 #include <linux/buffer_head.h>
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index d1da94b82d8f..dc2c65e04853 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -11,6 +11,7 @@
 #include <linux/smp_lock.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <asm/unaligned.h>
 #include <linux/buffer_head.h>
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index ba98546fabbd..19fbc810e8e7 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -50,6 +50,7 @@
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/uaccess.h>
+#include <linux/slab.h>
 
 #include <asm/system.h>
 
@@ -2217,6 +2218,15 @@ static int journal_read_transaction(struct super_block *sb,
 		brelse(d_bh);
 		return 1;
 	}
+
+	if (bdev_read_only(sb->s_bdev)) {
+		reiserfs_warning(sb, "clm-2076",
+				 "device is readonly, unable to replay log");
+		brelse(c_bh);
+		brelse(d_bh);
+		return -EROFS;
+	}
+
 	trans_id = get_desc_trans_id(desc);
 	/* now we know we've got a good transaction, and it was inside the valid time ranges */
 	log_blocks = kmalloc(get_desc_trans_len(desc) *
@@ -2459,12 +2469,6 @@ static int journal_read(struct super_block *sb)
 		goto start_log_replay;
 	}
 
-	if (continue_replay && bdev_read_only(sb->s_bdev)) {
-		reiserfs_warning(sb, "clm-2076",
-				 "device is readonly, unable to replay log");
-		return -1;
-	}
-
 	/* ok, there are transactions that need to be replayed. start with the first log block, find
 	** all the valid transactions, and pick out the oldest.
 	*/
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 96e4cbbfaa18..d0c43cb99ffc 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -13,6 +13,7 @@
 
 #include <linux/time.h>
 #include <linux/bitops.h>
+#include <linux/slab.h>
 #include <linux/reiserfs_fs.h>
 #include <linux/reiserfs_acl.h>
 #include <linux/reiserfs_xattr.h>
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 04bf5d791bda..59125fb36d42 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -12,6 +12,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/time.h>
 #include <asm/uaccess.h>
@@ -1618,10 +1619,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	save_mount_options(s, data);
 
 	sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
-	if (!sbi) {
-		errval = -ENOMEM;
-		goto error_alloc;
-	}
+	if (!sbi)
+		return -ENOMEM;
 	s->s_fs_info = sbi;
 	/* Set default values for options: non-aggressive tails, RO on errors */
 	REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
@@ -1878,12 +1877,12 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	return (0);
 
 error:
-	reiserfs_write_unlock(s);
-error_alloc:
 	if (jinit_done) {	/* kill the commit thread, free journal ram */
 		journal_release_error(NULL, s);
 	}
 
+	reiserfs_write_unlock(s);
+
 	reiserfs_free_bitmap_cache(s);
 	if (SB_BUFFER_WITH_SB(s))
 		brelse(SB_BUFFER_WITH_SB(s));
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 37d034ca7d99..e7cc00e636dc 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -38,6 +38,7 @@
 #include <linux/dcache.h>
 #include <linux/namei.h>
 #include <linux/errno.h>
+#include <linux/gfp.h>
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/pagemap.h>
@@ -553,7 +554,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
 	if (!err && new_size < i_size_read(dentry->d_inode)) {
 		struct iattr newattrs = {
 			.ia_ctime = current_fs_time(inode->i_sb),
-			.ia_size = buffer_size,
+			.ia_size = new_size,
 			.ia_valid = ATTR_SIZE | ATTR_CTIME,
 		};
 
@@ -972,21 +973,13 @@ int reiserfs_permission(struct inode *inode, int mask)
 	return generic_permission(inode, mask, NULL);
 }
 
-/* This will catch lookups from the fs root to .reiserfs_priv */
-static int
-xattr_lookup_poison(struct dentry *dentry, struct qstr *q1, struct qstr *name)
+static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-	struct dentry *priv_root = REISERFS_SB(dentry->d_sb)->priv_root;
-	if (container_of(q1, struct dentry, d_name) == priv_root)
-		return -ENOENT;
-	if (q1->len == name->len &&
-	    !memcmp(q1->name, name->name, name->len))
-		return 0;
-	return 1;
+	return -EPERM;
 }
 
 static const struct dentry_operations xattr_lookup_poison_ops = {
-	.d_compare = xattr_lookup_poison,
+	.d_revalidate = xattr_hide_revalidate,
 };
 
 int reiserfs_lookup_privroot(struct super_block *s)
@@ -1000,8 +993,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
 			   strlen(PRIVROOT_NAME));
 	if (!IS_ERR(dentry)) {
 		REISERFS_SB(s)->priv_root = dentry;
-		if (!reiserfs_expose_privroot(s))
-			s->s_root->d_op = &xattr_lookup_poison_ops;
+		dentry->d_op = &xattr_lookup_poison_ops;
 		if (dentry->d_inode)
 			dentry->d_inode->i_flags |= S_PRIVATE;
 	} else
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index dd20a7883f0f..9cdb759645a9 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -5,6 +5,7 @@
 #include <linux/errno.h>
 #include <linux/pagemap.h>
 #include <linux/xattr.h>
+#include <linux/slab.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/reiserfs_xattr.h>
 #include <linux/reiserfs_acl.h>
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index d8b5bfcbdd30..7271a477c041 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -3,6 +3,7 @@
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/xattr.h>
+#include <linux/slab.h>
 #include <linux/reiserfs_xattr.h>
 #include <linux/security.h>
 #include <asm/uaccess.h>
@@ -76,7 +77,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
 		return error;
 	}
 
-	if (sec->length) {
+	if (sec->length && reiserfs_xattrs_initialized(inode->i_sb)) {
 		blocks = reiserfs_xattr_jcreate_nblocks(inode) +
 			 reiserfs_xattr_nblocks(inode, sec->length);
 		/* We don't want to count the directories twice if we have
diff --git a/fs/select.c b/fs/select.c
index 73715e90030f..500a669f7790 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -691,6 +691,23 @@ SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
 }
 #endif /* HAVE_SET_RESTORE_SIGMASK */
 
+#ifdef __ARCH_WANT_SYS_OLD_SELECT
+struct sel_arg_struct {
+	unsigned long n;
+	fd_set __user *inp, *outp, *exp;
+	struct timeval __user *tvp;
+};
+
+SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
+{
+	struct sel_arg_struct a;
+
+	if (copy_from_user(&a, arg, sizeof(a)))
+		return -EFAULT;
+	return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
+}
+#endif
+
 struct poll_list {
 	struct poll_list *next;
 	int len;
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 1dabe4ee02fe..f329849ce3c0 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -22,6 +22,7 @@
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/kernel.h>
 #include <linux/signal.h>
 #include <linux/list.h>
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index 92d5e8ffb639..dbf6548bbf06 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -13,7 +13,6 @@
 #include <linux/fcntl.h>
 #include <linux/stat.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/smp_lock.h>
 #include <linux/net.h>
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 1c4c8f089970..dfa1d67f8fca 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -479,6 +479,7 @@ smb_put_super(struct super_block *sb)
 	if (server->conn_pid)
 		kill_pid(server->conn_pid, SIGTERM, 1);
 
+	bdi_destroy(&server->bdi);
 	kfree(server->ops);
 	smb_unload_nls(server);
 	sb->s_fs_info = NULL;
@@ -525,6 +526,11 @@ static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
 	if (!server)
 		goto out_no_server;
 	sb->s_fs_info = server;
+
+	if (bdi_setup_and_register(&server->bdi, "smbfs", BDI_CAP_MAP_COPY))
+		goto out_bdi;
+
+	sb->s_bdi = &server->bdi;
 
 	server->super_block = sb;
 	server->mnt = NULL;
@@ -624,6 +630,8 @@ out_no_smbiod:
 out_bad_option:
 	kfree(mem);
 out_no_mem:
+	bdi_destroy(&server->bdi);
+out_bdi:
 	if (!server->mnt)
 		printk(KERN_ERR "smb_fill_super: allocation failure\n");
 	sb->s_fs_info = NULL;
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
index 6bd9b691a463..0e39a924f10a 100644
--- a/fs/smbfs/smbiod.c
+++ b/fs/smbfs/smbiod.c
@@ -12,7 +12,6 @@
 #include <linux/string.h>
 #include <linux/stat.h>
 #include <linux/errno.h>
-#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/file.h>
 #include <linux/dcache.h>
diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c
index 00b2909bd469..54350b59046b 100644
--- a/fs/smbfs/symlink.c
+++ b/fs/smbfs/symlink.c
@@ -15,6 +15,7 @@
 #include <linux/pagemap.h>
 #include <linux/net.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
diff --git a/fs/splice.c b/fs/splice.c
index 39208663aaf1..9313b6124a2e 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -30,6 +30,7 @@
 #include <linux/syscalls.h>
 #include <linux/uio.h>
 #include <linux/security.h>
+#include <linux/gfp.h>
 
 /*
  * Attempt to steal a page from a pipe buffer. This should perhaps go into
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 1cb0d81b164b..653c030eb840 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -87,9 +87,8 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
 	u64 cur_index = index >> msblk->devblksize_log2;
 	int bytes, compressed, b = 0, k = 0, page = 0, avail;
 
-
-	bh = kcalloc((msblk->block_size >> msblk->devblksize_log2) + 1,
-		sizeof(*bh), GFP_KERNEL);
+	bh = kcalloc(((srclength + msblk->devblksize - 1)
+		>> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL);
 	if (bh == NULL)
 		return -ENOMEM;
 
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 3550aec2f655..48b6f4a385a6 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -275,7 +275,8 @@ allocate_root:
 
 	err = squashfs_read_inode(root, root_inode);
 	if (err) {
-		iget_failed(root);
+		make_bad_inode(root);
+		iput(root);
 		goto failed_mount;
 	}
 	insert_inode_hash(root);
@@ -353,6 +354,7 @@ static void squashfs_put_super(struct super_block *sb)
 		kfree(sbi->id_table);
 		kfree(sbi->fragment_index);
 		kfree(sbi->meta_index);
+		kfree(sbi->inode_lookup_table);
 		kfree(sb->s_fs_info);
 		sb->s_fs_info = NULL;
 	}
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index e80be2022a7f..32b911f4ee39 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -33,7 +33,6 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/pagemap.h>
 
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 4dd70e04333b..7a603874e483 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -24,6 +24,7 @@
 
 #include <linux/mutex.h>
 #include <linux/buffer_head.h>
+#include <linux/slab.h>
 #include <linux/zlib.h>
 
 #include "squashfs_fs.h"
@@ -127,8 +128,9 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
 		goto release_mutex;
 	}
 
+	length = stream->total_out;
 	mutex_unlock(&msblk->read_data_mutex);
-	return stream->total_out;
+	return length;
 
 release_mutex:
 	mutex_unlock(&msblk->read_data_mutex);
diff --git a/fs/super.c b/fs/super.c
index f35ac6022109..1527e6a0ee35 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -37,6 +37,7 @@
 #include <linux/kobject.h>
 #include <linux/mutex.h>
 #include <linux/file.h>
+#include <linux/backing-dev.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -693,6 +694,7 @@ int set_anon_super(struct super_block *s, void *data)
 		return -EMFILE;
 	}
 	s->s_dev = MKDEV(0, dev & MINORMASK);
+	s->s_bdi = &noop_backing_dev_info;
 	return 0;
 }
 
@@ -954,10 +956,11 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 	if (error < 0)
 		goto out_free_secdata;
 	BUG_ON(!mnt->mnt_sb);
+	WARN_ON(!mnt->mnt_sb->s_bdi);
 
 	error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata);
 	if (error)
 		goto out_sb;
 
 	/*
 	 * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
diff --git a/fs/sync.c b/fs/sync.c
index f557d71cb097..92b228176f7c 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -5,6 +5,7 @@
 #include <linux/kernel.h>
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/writeback.h>
@@ -13,6 +14,7 @@
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
 #include <linux/buffer_head.h>
+#include <linux/backing-dev.h>
 #include "internal.h"
 
 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
@@ -31,7 +33,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
 	 * This should be safe, as we require bdi backing to actually
 	 * write out data in the first place
 	 */
-	if (!sb->s_bdi)
+	if (!sb->s_bdi || sb->s_bdi == &noop_backing_dev_info)
 		return 0;
 
 	if (sb->s_qcop && sb->s_qcop->quota_sync)
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index a0a500af24a1..806b277453f9 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -54,14 +54,14 @@ fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count)
 	int rc;
 
 	/* need attr_sd for attr, its parent for kobj */
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -ENODEV;
 
 	rc = -EIO;
 	if (attr->read)
 		rc = attr->read(kobj, attr, buffer, off, count);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 
 	return rc;
 }
@@ -125,14 +125,14 @@ flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count)
 	int rc;
 
 	/* need attr_sd for attr, its parent for kobj */
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -ENODEV;
 
 	rc = -EIO;
 	if (attr->write)
 		rc = attr->write(kobj, attr, buffer, offset, count);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 
 	return rc;
 }
@@ -184,12 +184,12 @@ static void bin_vma_open(struct vm_area_struct *vma)
 	if (!bb->vm_ops || !bb->vm_ops->open)
 		return;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return;
 
 	bb->vm_ops->open(vma);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 }
 
 static void bin_vma_close(struct vm_area_struct *vma)
@@ -201,12 +201,12 @@ static void bin_vma_close(struct vm_area_struct *vma)
 	if (!bb->vm_ops || !bb->vm_ops->close)
 		return;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return;
 
 	bb->vm_ops->close(vma);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 }
 
 static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
@@ -219,12 +219,12 @@ static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (!bb->vm_ops || !bb->vm_ops->fault)
 		return VM_FAULT_SIGBUS;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return VM_FAULT_SIGBUS;
 
 	ret = bb->vm_ops->fault(vma, vmf);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return ret;
 }
 
@@ -241,12 +241,12 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (!bb->vm_ops->page_mkwrite)
 		return 0;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return VM_FAULT_SIGBUS;
 
 	ret = bb->vm_ops->page_mkwrite(vma, vmf);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return ret;
 }
 
@@ -261,12 +261,12 @@ static int bin_access(struct vm_area_struct *vma, unsigned long addr,
 	if (!bb->vm_ops || !bb->vm_ops->access)
 		return -EINVAL;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -EINVAL;
 
 	ret = bb->vm_ops->access(vma, addr, buf, len, write);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return ret;
 }
 
@@ -281,12 +281,12 @@ static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
 	if (!bb->vm_ops || !bb->vm_ops->set_policy)
 		return 0;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -EINVAL;
 
 	ret = bb->vm_ops->set_policy(vma, new);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return ret;
 }
 
@@ -301,12 +301,12 @@ static struct mempolicy *bin_get_policy(struct vm_area_struct *vma,
 	if (!bb->vm_ops || !bb->vm_ops->get_policy)
 		return vma->vm_policy;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return vma->vm_policy;
 
 	pol = bb->vm_ops->get_policy(vma, addr);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return pol;
 }
 
@@ -321,12 +321,12 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
 	if (!bb->vm_ops || !bb->vm_ops->migrate)
 		return 0;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return 0;
 
 	ret = bb->vm_ops->migrate(vma, from, to, flags);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return ret;
 }
 #endif
@@ -356,7 +356,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
 
 	/* need attr_sd for attr, its parent for kobj */
 	rc = -ENODEV;
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		goto out_unlock;
 
 	rc = -EINVAL;
@@ -384,7 +384,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
 	bb->vm_ops = vma->vm_ops;
 	vma->vm_ops = &bin_vm_ops;
 out_put:
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 out_unlock:
 	mutex_unlock(&bb->mutex);
 
@@ -399,7 +399,7 @@ static int open(struct inode * inode, struct file * file)
 	int error;
 
 	/* binary file operations requires both @sd and its parent */
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -ENODEV;
 
 	error = -EACCES;
@@ -426,11 +426,11 @@ static int open(struct inode * inode, struct file * file)
 	mutex_unlock(&sysfs_bin_lock);
 
 	/* open succeeded, put active references */
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return 0;
 
  err_out:
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	kfree(bb);
 	return error;
 }
@@ -501,7 +501,7 @@ int sysfs_create_bin_file(struct kobject *kobj,
 void sysfs_remove_bin_file(struct kobject *kobj,
 			   const struct bin_attribute *attr)
 {
-	sysfs_hash_and_remove(kobj->sd, attr->attr.name);
+	sysfs_hash_and_remove(kobj->sd, NULL, attr->attr.name);
 }
 
 EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 699f371b9f12..b2b83067ccc8 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -93,7 +93,7 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
  * RETURNS:
  * Pointer to @sd on success, NULL on failure.
  */
-static struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
+struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
 {
 	if (unlikely(!sd))
 		return NULL;
@@ -124,7 +124,7 @@ static struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
  * Put an active reference to @sd.  This function is noop if @sd
  * is NULL.
  */
-static void sysfs_put_active(struct sysfs_dirent *sd)
+void sysfs_put_active(struct sysfs_dirent *sd)
 {
 	struct completion *cmpl;
 	int v;
@@ -145,45 +145,6 @@ static void sysfs_put_active(struct sysfs_dirent *sd)
 }
 
 /**
- * sysfs_get_active_two - get active references to sysfs_dirent and parent
- * @sd: sysfs_dirent of interest
- *
- * Get active reference to @sd and its parent.  Parent's active
- * reference is grabbed first.  This function is noop if @sd is
- * NULL.
- *
- * RETURNS:
- * Pointer to @sd on success, NULL on failure.
- */
-struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd)
-{
-	if (sd) {
-		if (sd->s_parent && unlikely(!sysfs_get_active(sd->s_parent)))
-			return NULL;
-		if (unlikely(!sysfs_get_active(sd))) {
-			sysfs_put_active(sd->s_parent);
-			return NULL;
-		}
-	}
-	return sd;
-}
-
-/**
- * sysfs_put_active_two - put active references to sysfs_dirent and parent
- * @sd: sysfs_dirent of interest
- *
- * Put active references to @sd and its parent.  This function is
- * noop if @sd is NULL.
- */
-void sysfs_put_active_two(struct sysfs_dirent *sd)
-{
-	if (sd) {
-		sysfs_put_active(sd);
-		sysfs_put_active(sd->s_parent);
-	}
-}
-
-/**
  * sysfs_deactivate - deactivate sysfs_dirent
  * @sd: sysfs_dirent to deactivate
  *
@@ -195,6 +156,10 @@ static void sysfs_deactivate(struct sysfs_dirent *sd)
 	int v;
 
 	BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED));
+
+	if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF))
+		return;
+
 	sd->s_sibling = (void *)&wait;
 
 	rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_);
@@ -354,7 +319,6 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
 
 	atomic_set(&sd->s_count, 1);
 	atomic_set(&sd->s_active, 0);
-	sysfs_dirent_init_lockdep(sd);
 
 	sd->s_name = name;
 	sd->s_mode = mode;
@@ -416,9 +380,15 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 {
 	struct sysfs_inode_attrs *ps_iattr;
 
-	if (sysfs_find_dirent(acxt->parent_sd, sd->s_name))
+	if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name))
 		return -EEXIST;
 
+	if (sysfs_ns_type(acxt->parent_sd) && !sd->s_ns) {
+		WARN(1, KERN_WARNING "sysfs: ns required in '%s' for '%s'\n",
+			acxt->parent_sd->s_name, sd->s_name);
+		return -EINVAL;
+	}
+
 	sd->s_parent = sysfs_get(acxt->parent_sd);
 
 	sysfs_link_sibling(sd);
@@ -569,13 +539,17 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
  * Pointer to sysfs_dirent if found, NULL if not.
  */
 struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
+				       const void *ns,
 				       const unsigned char *name)
 {
 	struct sysfs_dirent *sd;
 
-	for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling)
+	for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) {
+		if (sd->s_ns != ns)
+			continue;
 		if (!strcmp(sd->s_name, name))
 			return sd;
+	}
 	return NULL;
 }
@@ -594,12 +568,13 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
 * Pointer to sysfs_dirent if found, NULL if not.
 */
 struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
+				      const void *ns,
 				      const unsigned char *name)
 {
 	struct sysfs_dirent *sd;
 
 	mutex_lock(&sysfs_mutex);
-	sd = sysfs_find_dirent(parent_sd, name);
+	sd = sysfs_find_dirent(parent_sd, ns, name);
 	sysfs_get(sd);
 	mutex_unlock(&sysfs_mutex);
 
@@ -608,7 +583,8 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
 EXPORT_SYMBOL_GPL(sysfs_get_dirent);
 
 static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
-		      const char *name, struct sysfs_dirent **p_sd)
+		      enum kobj_ns_type type, const void *ns, const char *name,
+		      struct sysfs_dirent **p_sd)
 {
 	umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
 	struct sysfs_addrm_cxt acxt;
@@ -619,6 +595,9 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
 	sd = sysfs_new_dirent(name, mode, SYSFS_DIR);
 	if (!sd)
 		return -ENOMEM;
+
+	sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT);
+	sd->s_ns = ns;
 	sd->s_dir.kobj = kobj;
 
 	/* link in */
@@ -637,7 +616,25 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
 int sysfs_create_subdir(struct kobject *kobj, const char *name,
 			struct sysfs_dirent **p_sd)
 {
-	return create_dir(kobj, kobj->sd, name, p_sd);
+	return create_dir(kobj, kobj->sd,
+			  KOBJ_NS_TYPE_NONE, NULL, name, p_sd);
+}
+
+static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj)
+{
+	const struct kobj_ns_type_operations *ops;
+	enum kobj_ns_type type;
+
+	ops = kobj_child_ns_ops(kobj);
+	if (!ops)
+		return KOBJ_NS_TYPE_NONE;
+
+	type = ops->type;
+	BUG_ON(type <= KOBJ_NS_TYPE_NONE);
+	BUG_ON(type >= KOBJ_NS_TYPES);
+	BUG_ON(!kobj_ns_type_registered(type));
+
+	return type;
 }
 
 /**
@@ -646,7 +643,9 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
  */
 int sysfs_create_dir(struct kobject * kobj)
 {
+	enum kobj_ns_type type;
 	struct sysfs_dirent *parent_sd, *sd;
+	const void *ns = NULL;
 	int error = 0;
 
 	BUG_ON(!kobj);
@@ -656,7 +655,11 @@ int sysfs_create_dir(struct kobject * kobj)
 	else
 		parent_sd = &sysfs_root;
 
-	error = create_dir(kobj, parent_sd, kobject_name(kobj), &sd);
+	if (sysfs_ns_type(parent_sd))
+		ns = kobj->ktype->namespace(kobj);
+	type = sysfs_read_ns_type(kobj);
+
+	error = create_dir(kobj, parent_sd, type, ns, kobject_name(kobj), &sd);
 	if (!error)
 		kobj->sd = sd;
 	return error;
@@ -666,13 +669,19 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 				    struct nameidata *nd)
 {
 	struct dentry *ret = NULL;
-	struct sysfs_dirent *parent_sd = dentry->d_parent->d_fsdata;
+	struct dentry *parent = dentry->d_parent;
+	struct sysfs_dirent *parent_sd = parent->d_fsdata;
 	struct sysfs_dirent *sd;
 	struct inode *inode;
+	enum kobj_ns_type type;
+	const void *ns;
 
 	mutex_lock(&sysfs_mutex);
 
-	sd = sysfs_find_dirent(parent_sd, dentry->d_name.name);
+	type = sysfs_ns_type(parent_sd);
+	ns = sysfs_info(dir->i_sb)->ns[type];
+
+	sd = sysfs_find_dirent(parent_sd, ns, dentry->d_name.name);
 
 	/* no such entry */
 	if (!sd) {
@@ -681,7 +690,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 	}
 
 	/* attach dentry and inode */
-	inode = sysfs_get_inode(sd);
+	inode = sysfs_get_inode(dir->i_sb, sd);
 	if (!inode) {
 		ret = ERR_PTR(-ENOMEM);
 		goto out_unlock;
@@ -771,7 +780,8 @@ void sysfs_remove_dir(struct kobject * kobj)
 }
 
 int sysfs_rename(struct sysfs_dirent *sd,
-		 struct sysfs_dirent *new_parent_sd, const char *new_name)
+		 struct sysfs_dirent *new_parent_sd, const void *new_ns,
+		 const char *new_name)
 {
 	const char *dup_name = NULL;
 	int error;
@@ -779,12 +789,12 @@ int sysfs_rename(struct sysfs_dirent *sd,
 	mutex_lock(&sysfs_mutex);
 
 	error = 0;
-	if ((sd->s_parent == new_parent_sd) &&
+	if ((sd->s_parent == new_parent_sd) && (sd->s_ns == new_ns) &&
 	    (strcmp(sd->s_name, new_name) == 0))
 		goto out;	/* nothing to rename */
 
 	error = -EEXIST;
-	if (sysfs_find_dirent(new_parent_sd, new_name))
+	if (sysfs_find_dirent(new_parent_sd, new_ns, new_name))
 		goto out;
 
 	/* rename sysfs_dirent */
@@ -806,6 +816,7 @@ int sysfs_rename(struct sysfs_dirent *sd,
 		sd->s_parent = new_parent_sd;
 		sysfs_link_sibling(sd);
 	}
+	sd->s_ns = new_ns;
 
 	error = 0;
  out:
@@ -816,19 +827,28 @@ int sysfs_rename(struct sysfs_dirent *sd,
 
 int sysfs_rename_dir(struct kobject *kobj, const char *new_name)
 {
-	return sysfs_rename(kobj->sd, kobj->sd->s_parent, new_name);
+	struct sysfs_dirent *parent_sd = kobj->sd->s_parent;
+	const void *new_ns = NULL;
+
+	if (sysfs_ns_type(parent_sd))
+		new_ns = kobj->ktype->namespace(kobj);
+
+	return sysfs_rename(kobj->sd, parent_sd, new_ns, new_name);
 }
 
 int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
 {
 	struct sysfs_dirent *sd = kobj->sd;
 	struct sysfs_dirent *new_parent_sd;
+	const void *new_ns = NULL;
 
 	BUG_ON(!sd->s_parent);
+	if (sysfs_ns_type(sd->s_parent))
+		new_ns = kobj->ktype->namespace(kobj);
 	new_parent_sd = new_parent_kobj && new_parent_kobj->sd ?
 		new_parent_kobj->sd : &sysfs_root;
 
-	return sysfs_rename(sd, new_parent_sd, sd->s_name);
+	return sysfs_rename(sd, new_parent_sd, new_ns, sd->s_name);
 }
 
 /* Relationship between s_mode and the DT_xxx types */
@@ -837,13 +857,56 @@ static inline unsigned char dt_type(struct sysfs_dirent *sd)
 	return (sd->s_mode >> 12) & 15;
 }
 
+static int sysfs_dir_release(struct inode *inode, struct file *filp)
+{
+	sysfs_put(filp->private_data);
+	return 0;
+}
+
+static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
+	struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
+{
+	if (pos) {
+		int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) &&
+			pos->s_parent == parent_sd &&
+			ino == pos->s_ino;
+		sysfs_put(pos);
+		if (!valid)
+			pos = NULL;
+	}
+	if (!pos && (ino > 1) && (ino < INT_MAX)) {
+		pos = parent_sd->s_dir.children;
+		while (pos && (ino > pos->s_ino))
+			pos = pos->s_sibling;
+	}
+	while (pos && pos->s_ns != ns)
+		pos = pos->s_sibling;
+	return pos;
+}
+
+static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
+	struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
+{
+	pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
+	if (pos)
+		pos = pos->s_sibling;
+	while (pos && pos->s_ns != ns)
+		pos = pos->s_sibling;
+	return pos;
+}
+
 static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
 {
 	struct dentry *dentry = filp->f_path.dentry;
 	struct sysfs_dirent * parent_sd = dentry->d_fsdata;
-	struct sysfs_dirent *pos;
+	struct sysfs_dirent *pos = filp->private_data;
+	enum kobj_ns_type type;
+	const void *ns;
 	ino_t ino;
 
+	type = sysfs_ns_type(parent_sd);
+	ns = sysfs_info(dentry->d_sb)->ns[type];
+
 	if (filp->f_pos == 0) {
 		ino = parent_sd->s_ino;
 		if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0)
@@ -857,29 +920,31 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
 		if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0)
 			filp->f_pos++;
 	}
-	if ((filp->f_pos > 1) && (filp->f_pos < INT_MAX)) {
-		mutex_lock(&sysfs_mutex);
-
-		/* Skip the dentries we have already reported */
-		pos = parent_sd->s_dir.children;
-		while (pos && (filp->f_pos > pos->s_ino))
-			pos = pos->s_sibling;
-
-		for ( ; pos; pos = pos->s_sibling) {
-			const char * name;
-			int len;
-
-			name = pos->s_name;
-			len = strlen(name);
-			filp->f_pos = ino = pos->s_ino;
-
-			if (filldir(dirent, name, len, filp->f_pos, ino,
-				    dt_type(pos)) < 0)
-				break;
-		}
-		if (!pos)
-			filp->f_pos = INT_MAX;
+	mutex_lock(&sysfs_mutex);
+	for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos);
+	     pos;
+	     pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) {
+		const char * name;
+		unsigned int type;
+		int len, ret;
+
+		name = pos->s_name;
+		len = strlen(name);
+		ino = pos->s_ino;
+		type = dt_type(pos);
+		filp->f_pos = ino;
+		filp->private_data = sysfs_get(pos);
+
 		mutex_unlock(&sysfs_mutex);
+		ret = filldir(dirent, name, len, filp->f_pos, ino, type);
+		mutex_lock(&sysfs_mutex);
+		if (ret < 0)
+			break;
+	}
+	mutex_unlock(&sysfs_mutex);
+	if ((filp->f_pos > 1) && !pos) { /* EOF */
+		filp->f_pos = INT_MAX;
+		filp->private_data = NULL;
 	}
 	return 0;
 }
@@ -888,5 +953,6 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
 const struct file_operations sysfs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= sysfs_readdir,
+	.release	= sysfs_dir_release,
 	.llseek		= generic_file_llseek,
 };
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index dc30d9e31683..1beaa739d0a6 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -53,7 +53,7 @@ struct sysfs_buffer {
 	size_t			count;
 	loff_t			pos;
 	char			* page;
-	struct sysfs_ops	* ops;
+	const struct sysfs_ops	* ops;
 	struct mutex		mutex;
 	int			needs_read_fill;
 	int			event;
@@ -75,7 +75,7 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
 {
 	struct sysfs_dirent *attr_sd = dentry->d_fsdata;
 	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
-	struct sysfs_ops * ops = buffer->ops;
+	const struct sysfs_ops * ops = buffer->ops;
 	int ret = 0;
 	ssize_t count;
 
@@ -85,13 +85,13 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
 		return -ENOMEM;
 
 	/* need attr_sd for attr and ops, its parent for kobj */
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -ENODEV;
 
 	buffer->event = atomic_read(&attr_sd->s_attr.open->event);
 	count = ops->show(kobj, attr_sd->s_attr.attr, buffer->page);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 
 	/*
 	 * The code works fine with PAGE_SIZE return but it's likely to
@@ -199,16 +199,16 @@ flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, size_t
 {
 	struct sysfs_dirent *attr_sd = dentry->d_fsdata;
 	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
-	struct sysfs_ops * ops = buffer->ops;
+	const struct sysfs_ops * ops = buffer->ops;
 	int rc;
 
 	/* need attr_sd for attr and ops, its parent for kobj */
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -ENODEV;
 
 	rc = ops->store(kobj, attr_sd->s_attr.attr, buffer->page, count);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 
 	return rc;
 }
@@ -335,7 +335,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
 	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
 	struct sysfs_buffer *buffer;
-	struct sysfs_ops *ops;
+	const struct sysfs_ops *ops;
 	int error = -EACCES;
 	char *p;
 
@@ -344,7 +344,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 		memmove(last_sysfs_file, p, strlen(p) + 1);
 
 	/* need attr_sd for attr and ops, its parent for kobj */
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -ENODEV;
 
 	/* every kobject with an attribute needs a ktype assigned */
@@ -393,13 +393,13 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 		goto err_free;
 
 	/* open succeeded, put active references */
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return 0;
 
  err_free:
 	kfree(buffer);
 err_out:
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return error;
 }
 
@@ -437,12 +437,12 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
 	struct sysfs_open_dirent *od = attr_sd->s_attr.open;
 
 	/* need parent for the kobj, grab both */
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		goto trigger;
 
 	poll_wait(filp, &od->poll, wait);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 
 	if (buffer->event != atomic_read(&od->event))
 		goto trigger;
@@ -478,9 +478,12 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
 	mutex_lock(&sysfs_mutex);
 
 	if (sd && dir)
-		sd = sysfs_find_dirent(sd, dir);
+		/* Only directories are tagged, so no need to pass
+		 * a tag explicitly.
+		 */
+		sd = sysfs_find_dirent(sd, NULL, dir);
 	if (sd && attr)
-		sd = sysfs_find_dirent(sd, attr);
+		sd = sysfs_find_dirent(sd, NULL, attr);
 	if (sd)
 		sysfs_notify_dirent(sd);
 
@@ -509,6 +512,7 @@ int sysfs_add_file_mode(struct sysfs_dirent *dir_sd,
 	if (!sd)
 		return -ENOMEM;
 	sd->s_attr.attr = (void *)attr;
+	sysfs_dirent_init_lockdep(sd);
 
 	sysfs_addrm_start(&acxt, dir_sd);
 	rc = sysfs_add_one(&acxt, sd);
@@ -542,6 +546,18 @@ int sysfs_create_file(struct kobject * kobj, const struct attribute * attr)
 
 }
 
+int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr)
+{
+	int err = 0;
+	int i;
+
+	for (i = 0; ptr[i] && !err; i++)
+		err = sysfs_create_file(kobj, ptr[i]);
+	if (err)
+		while (--i >= 0)
+			sysfs_remove_file(kobj, ptr[i]);
+	return err;
+}
 
 /**
  * sysfs_add_file_to_group - add an attribute file to a pre-existing group.
@@ -556,7 +572,7 @@ int sysfs_add_file_to_group(struct kobject *kobj,
 	int error;
 
 	if (group)
-		dir_sd = sysfs_get_dirent(kobj->sd, group);
+		dir_sd = sysfs_get_dirent(kobj->sd, NULL, group);
 	else
 		dir_sd = sysfs_get(kobj->sd);
 
@@ -586,7 +602,7 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
 	mutex_lock(&sysfs_mutex);
 
 	rc = -ENOENT;
-	sd = sysfs_find_dirent(kobj->sd, attr->name);
+	sd = sysfs_find_dirent(kobj->sd, NULL, attr->name);
 	if (!sd)
 		goto out;
 
@@ -611,9 +627,15 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
 
 void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
 {
-	sysfs_hash_and_remove(kobj->sd, attr->name);
+	sysfs_hash_and_remove(kobj->sd, NULL, attr->name);
 }
 
+void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr)
+{
+	int i;
+	for (i = 0; ptr[i]; i++)
+		sysfs_remove_file(kobj, ptr[i]);
+}
 
 /**
  * sysfs_remove_file_from_group - remove an attribute file from a group.
@@ -627,11 +649,11 @@ void sysfs_remove_file_from_group(struct kobject *kobj,
 	struct sysfs_dirent *dir_sd;
 
 	if (group)
-		dir_sd = sysfs_get_dirent(kobj->sd, group);
+		dir_sd = sysfs_get_dirent(kobj->sd, NULL, group);
 	else
 		dir_sd = sysfs_get(kobj->sd);
 	if (dir_sd) {
-		sysfs_hash_and_remove(dir_sd, attr->name);
+		sysfs_hash_and_remove(dir_sd, NULL, attr->name);
 		sysfs_put(dir_sd);
 	}
 }
@@ -732,3 +754,5 @@ EXPORT_SYMBOL_GPL(sysfs_schedule_callback);
 
 EXPORT_SYMBOL_GPL(sysfs_create_file);
 EXPORT_SYMBOL_GPL(sysfs_remove_file);
+EXPORT_SYMBOL_GPL(sysfs_remove_files);
+EXPORT_SYMBOL_GPL(sysfs_create_files);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index fe611949a7f7..23c1e598792a 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -23,7 +23,7 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 	int i;
 
 	for (i = 0, attr = grp->attrs; *attr; i++, attr++)
-		sysfs_hash_and_remove(dir_sd, (*attr)->name);
+		sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
 }
 
 static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
@@ -39,7 +39,7 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 			 * visibility.  Do this by first removing then
 			 * re-adding (if required) the file */
 			if (update)
-				sysfs_hash_and_remove(dir_sd, (*attr)->name);
+				sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
 			if (grp->is_visible) {
 				mode = grp->is_visible(kobj, *attr, i);
 				if (!mode)
@@ -132,7 +132,7 @@ void sysfs_remove_group(struct kobject * kobj,
 	struct sysfs_dirent *sd;
 
 	if (grp->name) {
-		sd = sysfs_get_dirent(dir_sd, grp->name);
+		sd = sysfs_get_dirent(dir_sd, NULL, grp->name);
 		if (!sd) {
 			WARN(!sd, KERN_WARNING "sysfs group %p not found for "
 			     "kobject '%s'\n", grp, kobject_name(kobj));
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 6a06a1d1ea7b..cf2bad1462ea 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -18,6 +18,7 @@
18#include <linux/capability.h> 18#include <linux/capability.h>
19#include <linux/errno.h> 19#include <linux/errno.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h>
21#include <linux/xattr.h> 22#include <linux/xattr.h>
22#include <linux/security.h> 23#include <linux/security.h>
23#include "sysfs.h" 24#include "sysfs.h"
@@ -111,20 +112,20 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
111 if (!sd) 112 if (!sd)
112 return -EINVAL; 113 return -EINVAL;
113 114
115 mutex_lock(&sysfs_mutex);
114 error = inode_change_ok(inode, iattr); 116 error = inode_change_ok(inode, iattr);
115 if (error) 117 if (error)
116 return error; 118 goto out;
117 119
118 iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */ 120 iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */
119 121
120 error = inode_setattr(inode, iattr); 122 error = inode_setattr(inode, iattr);
121 if (error) 123 if (error)
122 return error; 124 goto out;
123 125
124 mutex_lock(&sysfs_mutex);
125 error = sysfs_sd_setattr(sd, iattr); 126 error = sysfs_sd_setattr(sd, iattr);
127out:
126 mutex_unlock(&sysfs_mutex); 128 mutex_unlock(&sysfs_mutex);
127
128 return error; 129 return error;
129} 130}
130 131
@@ -283,6 +284,7 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
283 284
284/** 285/**
285 * sysfs_get_inode - get inode for sysfs_dirent 286 * sysfs_get_inode - get inode for sysfs_dirent
287 * @sb: super block
286 * @sd: sysfs_dirent to allocate inode for 288 * @sd: sysfs_dirent to allocate inode for
287 * 289 *
288 * Get inode for @sd. If such inode doesn't exist, a new inode 290 * Get inode for @sd. If such inode doesn't exist, a new inode
@@ -295,11 +297,11 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
295 * RETURNS: 297 * RETURNS:
296 * Pointer to allocated inode on success, NULL on failure. 298 * Pointer to allocated inode on success, NULL on failure.
297 */ 299 */
298struct inode * sysfs_get_inode(struct sysfs_dirent *sd) 300struct inode * sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd)
299{ 301{
300 struct inode *inode; 302 struct inode *inode;
301 303
302 inode = iget_locked(sysfs_sb, sd->s_ino); 304 inode = iget_locked(sb, sd->s_ino);
303 if (inode && (inode->i_state & I_NEW)) 305 if (inode && (inode->i_state & I_NEW))
304 sysfs_init_inode(sd, inode); 306 sysfs_init_inode(sd, inode);
305 307
@@ -322,7 +324,7 @@ void sysfs_delete_inode(struct inode *inode)
322 sysfs_put(sd); 324 sysfs_put(sd);
323} 325}
324 326
325int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name) 327int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name)
326{ 328{
327 struct sysfs_addrm_cxt acxt; 329 struct sysfs_addrm_cxt acxt;
328 struct sysfs_dirent *sd; 330 struct sysfs_dirent *sd;
@@ -332,7 +334,7 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name)
332 334
333 sysfs_addrm_start(&acxt, dir_sd); 335 sysfs_addrm_start(&acxt, dir_sd);
334 336
335 sd = sysfs_find_dirent(dir_sd, name); 337 sd = sysfs_find_dirent(dir_sd, ns, name);
336 if (sd) 338 if (sd)
337 sysfs_remove_one(&acxt, sd); 339 sysfs_remove_one(&acxt, sd);
338 340
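sysfs_find_dirent() itself is not part of this hunk, but the new ns argument threaded through here implies the lookup now matches on the (namespace tag, name) pair rather than the name alone. A hedged sketch of that predicate (simplified; the real lookup also has to cope with untagged entries):

	static bool sysfs_dirent_matches(struct sysfs_dirent *sd,
					 const void *ns, const char *name)
	{
		return sd->s_ns == ns && !strcmp(sd->s_name, name);
	}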
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 49749955ccaf..1afa32ba242c 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -18,12 +18,12 @@
18#include <linux/init.h> 18#include <linux/init.h>
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/magic.h> 20#include <linux/magic.h>
21#include <linux/slab.h>
21 22
22#include "sysfs.h" 23#include "sysfs.h"
23 24
24 25
25static struct vfsmount *sysfs_mount; 26static struct vfsmount *sysfs_mount;
26struct super_block * sysfs_sb = NULL;
27struct kmem_cache *sysfs_dir_cachep; 27struct kmem_cache *sysfs_dir_cachep;
28 28
29static const struct super_operations sysfs_ops = { 29static const struct super_operations sysfs_ops = {
@@ -35,7 +35,7 @@ static const struct super_operations sysfs_ops = {
35struct sysfs_dirent sysfs_root = { 35struct sysfs_dirent sysfs_root = {
36 .s_name = "", 36 .s_name = "",
37 .s_count = ATOMIC_INIT(1), 37 .s_count = ATOMIC_INIT(1),
38 .s_flags = SYSFS_DIR, 38 .s_flags = SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT),
39 .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, 39 .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
40 .s_ino = 1, 40 .s_ino = 1,
41}; 41};
@@ -50,11 +50,10 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
50 sb->s_magic = SYSFS_MAGIC; 50 sb->s_magic = SYSFS_MAGIC;
51 sb->s_op = &sysfs_ops; 51 sb->s_op = &sysfs_ops;
52 sb->s_time_gran = 1; 52 sb->s_time_gran = 1;
53 sysfs_sb = sb;
54 53
55 /* get root inode, initialize and unlock it */ 54 /* get root inode, initialize and unlock it */
56 mutex_lock(&sysfs_mutex); 55 mutex_lock(&sysfs_mutex);
57 inode = sysfs_get_inode(&sysfs_root); 56 inode = sysfs_get_inode(sb, &sysfs_root);
58 mutex_unlock(&sysfs_mutex); 57 mutex_unlock(&sysfs_mutex);
59 if (!inode) { 58 if (!inode) {
60 pr_debug("sysfs: could not get root inode\n"); 59 pr_debug("sysfs: could not get root inode\n");
@@ -73,18 +72,102 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
73 return 0; 72 return 0;
74} 73}
75 74
75static int sysfs_test_super(struct super_block *sb, void *data)
76{
77 struct sysfs_super_info *sb_info = sysfs_info(sb);
78 struct sysfs_super_info *info = data;
79 enum kobj_ns_type type;
80 int found = 1;
81
82 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) {
83 if (sb_info->ns[type] != info->ns[type])
84 found = 0;
85 }
86 return found;
87}
88
89static int sysfs_set_super(struct super_block *sb, void *data)
90{
91 int error;
92 error = set_anon_super(sb, data);
93 if (!error)
94 sb->s_fs_info = data;
95 return error;
96}
97
76static int sysfs_get_sb(struct file_system_type *fs_type, 98static int sysfs_get_sb(struct file_system_type *fs_type,
77 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 99 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
78{ 100{
79 return get_sb_single(fs_type, flags, data, sysfs_fill_super, mnt); 101 struct sysfs_super_info *info;
102 enum kobj_ns_type type;
103 struct super_block *sb;
104 int error;
105
106 error = -ENOMEM;
107 info = kzalloc(sizeof(*info), GFP_KERNEL);
108 if (!info)
109 goto out;
110
111 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
112 info->ns[type] = kobj_ns_current(type);
113
114 sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info);
115 if (IS_ERR(sb) || sb->s_fs_info != info)
116 kfree(info);
117 if (IS_ERR(sb)) {
118 error = PTR_ERR(sb);
119 goto out;
120 }
121 if (!sb->s_root) {
122 sb->s_flags = flags;
123 error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
124 if (error) {
125 deactivate_locked_super(sb);
126 goto out;
127 }
128 sb->s_flags |= MS_ACTIVE;
129 }
130
131 simple_set_mnt(mnt, sb);
132 error = 0;
133out:
134 return error;
135}
136
137static void sysfs_kill_sb(struct super_block *sb)
138{
139 struct sysfs_super_info *info = sysfs_info(sb);
140
141 kill_anon_super(sb);
142 kfree(info);
80} 143}
81 144
82static struct file_system_type sysfs_fs_type = { 145static struct file_system_type sysfs_fs_type = {
83 .name = "sysfs", 146 .name = "sysfs",
84 .get_sb = sysfs_get_sb, 147 .get_sb = sysfs_get_sb,
85 .kill_sb = kill_anon_super, 148 .kill_sb = sysfs_kill_sb,
86}; 149};
87 150
151void sysfs_exit_ns(enum kobj_ns_type type, const void *ns)
152{
153 struct super_block *sb;
154
155 mutex_lock(&sysfs_mutex);
156 spin_lock(&sb_lock);
157 list_for_each_entry(sb, &sysfs_fs_type.fs_supers, s_instances) {
158 struct sysfs_super_info *info = sysfs_info(sb);
159 /* Ignore superblocks that are in the process of unmounting */
160 if (sb->s_count <= S_BIAS)
161 continue;
162 /* Ignore superblocks with the wrong ns */
163 if (info->ns[type] != ns)
164 continue;
165 info->ns[type] = NULL;
166 }
167 spin_unlock(&sb_lock);
168 mutex_unlock(&sysfs_mutex);
169}
170
88int __init sysfs_init(void) 171int __init sysfs_init(void)
89{ 172{
90 int err = -ENOMEM; 173 int err = -ENOMEM;
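The sget() test/set pair above is the standard idiom for keyed (tagged) superblocks: test decides whether an existing superblock can be reused for this mount, set stamps the tag onto a freshly allocated one. Condensed here to a single-pointer tag for illustration (sysfs compares a per-type array of namespace tags instead):

	static int demo_test_super(struct super_block *sb, void *tag)
	{
		return sb->s_fs_info == tag;		/* same tag => share sb */
	}

	static int demo_set_super(struct super_block *sb, void *tag)
	{
		int error = set_anon_super(sb, tag);	/* allocate anon dev */

		if (!error)
			sb->s_fs_info = tag;		/* adopt the tag */
		return error;
	}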
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index c5eff49fa41b..b6ebdaa00f37 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -11,6 +11,7 @@
11 */ 11 */
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/gfp.h>
14#include <linux/mount.h> 15#include <linux/mount.h>
15#include <linux/module.h> 16#include <linux/module.h>
16#include <linux/kobject.h> 17#include <linux/kobject.h>
@@ -57,6 +58,8 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
57 if (!sd) 58 if (!sd)
58 goto out_put; 59 goto out_put;
59 60
61 if (sysfs_ns_type(parent_sd))
62 sd->s_ns = target->ktype->namespace(target);
60 sd->s_symlink.target_sd = target_sd; 63 sd->s_symlink.target_sd = target_sd;
61 target_sd = NULL; /* reference is now owned by the symlink */ 64 target_sd = NULL; /* reference is now owned by the symlink */
62 65
@@ -120,7 +123,52 @@ void sysfs_remove_link(struct kobject * kobj, const char * name)
120 else 123 else
121 parent_sd = kobj->sd; 124 parent_sd = kobj->sd;
122 125
123 sysfs_hash_and_remove(parent_sd, name); 126 sysfs_hash_and_remove(parent_sd, NULL, name);
127}
128
129/**
130 * sysfs_rename_link - rename symlink in object's directory.
131 * @kobj: object we're acting for.
132 * @targ: object we're pointing to.
133 * @old: previous name of the symlink.
134 * @new: new name of the symlink.
135 *
136 * A helper function for the common rename symlink idiom.
137 */
138int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
139 const char *old, const char *new)
140{
141 struct sysfs_dirent *parent_sd, *sd = NULL;
142 const void *old_ns = NULL, *new_ns = NULL;
143 int result;
144
145 if (!kobj)
146 parent_sd = &sysfs_root;
147 else
148 parent_sd = kobj->sd;
149
150 if (targ->sd)
151 old_ns = targ->sd->s_ns;
152
153 result = -ENOENT;
154 sd = sysfs_get_dirent(parent_sd, old_ns, old);
155 if (!sd)
156 goto out;
157
158 result = -EINVAL;
159 if (sysfs_type(sd) != SYSFS_KOBJ_LINK)
160 goto out;
161 if (sd->s_symlink.target_sd->s_dir.kobj != targ)
162 goto out;
163
164 if (sysfs_ns_type(parent_sd))
165 new_ns = targ->ktype->namespace(targ);
166
167 result = sysfs_rename(sd, parent_sd, new_ns, new);
168
169out:
170 sysfs_put(sd);
171 return result;
124} 172}
125 173
126static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, 174static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
@@ -222,3 +270,4 @@ const struct inode_operations sysfs_symlink_inode_operations = {
222 270
223EXPORT_SYMBOL_GPL(sysfs_create_link); 271EXPORT_SYMBOL_GPL(sysfs_create_link);
224EXPORT_SYMBOL_GPL(sysfs_remove_link); 272EXPORT_SYMBOL_GPL(sysfs_remove_link);
273EXPORT_SYMBOL_GPL(sysfs_rename_link);
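An illustrative call site for the new helper (the driver core's device-rename path is the intended consumer in this series; the kobjects and names below are placeholders):

	/* rename the symlink "old0" -> "new0" under parent, keeping it
	 * pointed at target */
	error = sysfs_rename_link(&parent->kobj, &target->kobj,
				  "old0", "new0");
	if (error)	/* -ENOENT: no such link; -EINVAL: not a link to target */
		return error;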
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index cdd9377a6e06..93847d54c2e3 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -58,6 +58,7 @@ struct sysfs_dirent {
58 struct sysfs_dirent *s_sibling; 58 struct sysfs_dirent *s_sibling;
59 const char *s_name; 59 const char *s_name;
60 60
61 const void *s_ns;
61 union { 62 union {
62 struct sysfs_elem_dir s_dir; 63 struct sysfs_elem_dir s_dir;
63 struct sysfs_elem_symlink s_symlink; 64 struct sysfs_elem_symlink s_symlink;
@@ -66,8 +67,8 @@ struct sysfs_dirent {
66 }; 67 };
67 68
68 unsigned int s_flags; 69 unsigned int s_flags;
70 unsigned short s_mode;
69 ino_t s_ino; 71 ino_t s_ino;
70 umode_t s_mode;
71 struct sysfs_inode_attrs *s_iattr; 72 struct sysfs_inode_attrs *s_iattr;
72}; 73};
73 74
@@ -79,21 +80,33 @@ struct sysfs_dirent {
79#define SYSFS_KOBJ_BIN_ATTR 0x0004 80#define SYSFS_KOBJ_BIN_ATTR 0x0004
80#define SYSFS_KOBJ_LINK 0x0008 81#define SYSFS_KOBJ_LINK 0x0008
81#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) 82#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK)
83#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR)
82 84
83#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK 85#define SYSFS_NS_TYPE_MASK 0xff00
84#define SYSFS_FLAG_REMOVED 0x0200 86#define SYSFS_NS_TYPE_SHIFT 8
87
88#define SYSFS_FLAG_MASK ~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK)
89#define SYSFS_FLAG_REMOVED 0x020000
85 90
86static inline unsigned int sysfs_type(struct sysfs_dirent *sd) 91static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
87{ 92{
88 return sd->s_flags & SYSFS_TYPE_MASK; 93 return sd->s_flags & SYSFS_TYPE_MASK;
89} 94}
90 95
96static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd)
97{
98 return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT;
99}
100
91#ifdef CONFIG_DEBUG_LOCK_ALLOC 101#ifdef CONFIG_DEBUG_LOCK_ALLOC
92#define sysfs_dirent_init_lockdep(sd) \ 102#define sysfs_dirent_init_lockdep(sd) \
93do { \ 103do { \
94 static struct lock_class_key __key; \ 104 struct attribute *attr = sd->s_attr.attr; \
105 struct lock_class_key *key = attr->key; \
106 if (!key) \
107 key = &attr->skey; \
95 \ 108 \
96 lockdep_init_map(&sd->dep_map, "s_active", &__key, 0); \ 109 lockdep_init_map(&sd->dep_map, "s_active", key, 0); \
97} while(0) 110} while(0)
98#else 111#else
99#define sysfs_dirent_init_lockdep(sd) do {} while(0) 112#define sysfs_dirent_init_lockdep(sd) do {} while(0)
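Since the lock class now lives in the attribute rather than in a static at this macro's expansion site, dynamically allocated attributes need an explicit init so lockdep sees a valid key. A sketch of the companion macro from include/linux/sysfs.h in the same series (reproduced from memory, so treat it as approximate):

	#ifdef CONFIG_DEBUG_LOCK_ALLOC
	#define sysfs_attr_init(attr)				\
	do {							\
		static struct lock_class_key __key;		\
								\
		(attr)->key = &__key;				\
	} while(0)
	#else
	#define sysfs_attr_init(attr) do {} while(0)
	#endif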
@@ -110,8 +123,11 @@ struct sysfs_addrm_cxt {
110/* 123/*
111 * mount.c 124 * mount.c
112 */ 125 */
126struct sysfs_super_info {
127 const void *ns[KOBJ_NS_TYPES];
128};
129#define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info))
113extern struct sysfs_dirent sysfs_root; 130extern struct sysfs_dirent sysfs_root;
114extern struct super_block *sysfs_sb;
115extern struct kmem_cache *sysfs_dir_cachep; 131extern struct kmem_cache *sysfs_dir_cachep;
116 132
117/* 133/*
@@ -124,8 +140,8 @@ extern const struct file_operations sysfs_dir_operations;
124extern const struct inode_operations sysfs_dir_inode_operations; 140extern const struct inode_operations sysfs_dir_inode_operations;
125 141
126struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd); 142struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd);
127struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd); 143struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd);
128void sysfs_put_active_two(struct sysfs_dirent *sd); 144void sysfs_put_active(struct sysfs_dirent *sd);
129void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, 145void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
130 struct sysfs_dirent *parent_sd); 146 struct sysfs_dirent *parent_sd);
131int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); 147int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
@@ -134,8 +150,10 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
134void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); 150void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
135 151
136struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, 152struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
153 const void *ns,
137 const unsigned char *name); 154 const unsigned char *name);
138struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, 155struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
156 const void *ns,
139 const unsigned char *name); 157 const unsigned char *name);
140struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type); 158struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type);
141 159
@@ -146,7 +164,7 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
146void sysfs_remove_subdir(struct sysfs_dirent *sd); 164void sysfs_remove_subdir(struct sysfs_dirent *sd);
147 165
148int sysfs_rename(struct sysfs_dirent *sd, 166int sysfs_rename(struct sysfs_dirent *sd,
149 struct sysfs_dirent *new_parent_sd, const char *new_name); 167 struct sysfs_dirent *new_parent_sd, const void *ns, const char *new_name);
150 168
151static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) 169static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
152{ 170{
@@ -168,7 +186,7 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
168/* 186/*
169 * inode.c 187 * inode.c
170 */ 188 */
171struct inode *sysfs_get_inode(struct sysfs_dirent *sd); 189struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd);
172void sysfs_delete_inode(struct inode *inode); 190void sysfs_delete_inode(struct inode *inode);
173int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr); 191int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
174int sysfs_permission(struct inode *inode, int mask); 192int sysfs_permission(struct inode *inode, int mask);
@@ -176,7 +194,7 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
176int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); 194int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
177int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, 195int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
178 size_t size, int flags); 196 size_t size, int flags);
179int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name); 197int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name);
180int sysfs_inode_init(void); 198int sysfs_inode_init(void);
181 199
182/* 200/*
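The namespace type rides in bits 8..15 of s_flags; a minimal round-trip sketch of the packing implied by SYSFS_NS_TYPE_MASK/SHIFT above (the setter side is illustrative, not from this hunk):

	/* at creation time: */
	sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT) & SYSFS_NS_TYPE_MASK;

	/* and sysfs_ns_type() reads it back: */
	type = (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT;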
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 4e50286a4cc3..1dabed286b4c 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -164,8 +164,8 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_
164 name, de->name)) 164 name, de->name))
165 goto found; 165 goto found;
166 } 166 }
167 dir_put_page(page);
167 } 168 }
168 dir_put_page(page);
169 169
170 if (++n >= npages) 170 if (++n >= npages)
171 n = 0; 171 n = 0;
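The one-line move pairs dir_put_page() with a successful dir_get_page() only, instead of also running on the error path where page is an ERR_PTR. The corrected shape, condensed:

	page = dir_get_page(dir, n);
	if (!IS_ERR(page)) {
		/* ... scan directory entries; "goto found" keeps the ref ... */
		dir_put_page(page);	/* drop it only if we actually got it */
	}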
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 1bfc95ad5f71..b86ab8eff79a 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -14,6 +14,7 @@
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/slab.h>
17#include <linux/list.h> 18#include <linux/list.h>
18#include <linux/spinlock.h> 19#include <linux/spinlock.h>
19#include <linux/time.h> 20#include <linux/time.h>
@@ -109,31 +110,14 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
109 struct timerfd_ctx *ctx = file->private_data; 110 struct timerfd_ctx *ctx = file->private_data;
110 ssize_t res; 111 ssize_t res;
111 u64 ticks = 0; 112 u64 ticks = 0;
112 DECLARE_WAITQUEUE(wait, current);
113 113
114 if (count < sizeof(ticks)) 114 if (count < sizeof(ticks))
115 return -EINVAL; 115 return -EINVAL;
116 spin_lock_irq(&ctx->wqh.lock); 116 spin_lock_irq(&ctx->wqh.lock);
117 res = -EAGAIN; 117 if (file->f_flags & O_NONBLOCK)
118 if (!ctx->ticks && !(file->f_flags & O_NONBLOCK)) { 118 res = -EAGAIN;
119 __add_wait_queue(&ctx->wqh, &wait); 119 else
120 for (res = 0;;) { 120 res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks);
121 set_current_state(TASK_INTERRUPTIBLE);
122 if (ctx->ticks) {
123 res = 0;
124 break;
125 }
126 if (signal_pending(current)) {
127 res = -ERESTARTSYS;
128 break;
129 }
130 spin_unlock_irq(&ctx->wqh.lock);
131 schedule();
132 spin_lock_irq(&ctx->wqh.lock);
133 }
134 __remove_wait_queue(&ctx->wqh, &wait);
135 __set_current_state(TASK_RUNNING);
136 }
137 if (ctx->ticks) { 121 if (ctx->ticks) {
138 ticks = ctx->ticks; 122 ticks = ctx->ticks;
139 if (ctx->expired && ctx->tintv.tv64) { 123 if (ctx->expired && ctx->tintv.tv64) {
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 4775af401167..37fa7ed062d8 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -45,6 +45,7 @@
45 45
46#include <linux/freezer.h> 46#include <linux/freezer.h>
47#include <linux/kthread.h> 47#include <linux/kthread.h>
48#include <linux/slab.h>
48#include "ubifs.h" 49#include "ubifs.h"
49 50
50/** 51/**
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 90492327b383..c2a68baa782f 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -34,6 +34,7 @@
34#include <linux/moduleparam.h> 34#include <linux/moduleparam.h>
35#include <linux/debugfs.h> 35#include <linux/debugfs.h>
36#include <linux/math64.h> 36#include <linux/math64.h>
37#include <linux/slab.h>
37 38
38#ifdef CONFIG_UBIFS_FS_DEBUG 39#ifdef CONFIG_UBIFS_FS_DEBUG
39 40
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index e26c02ab6cd5..5692cf72b807 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -52,6 +52,7 @@
52#include "ubifs.h" 52#include "ubifs.h"
53#include <linux/mount.h> 53#include <linux/mount.h>
54#include <linux/namei.h> 54#include <linux/namei.h>
55#include <linux/slab.h>
55 56
56static int read_block(struct inode *inode, void *addr, unsigned int block, 57static int read_block(struct inode *inode, void *addr, unsigned int block,
57 struct ubifs_data_node *dn) 58 struct ubifs_data_node *dn)
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index e5a3d8e96bb7..918d1582ca05 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -53,6 +53,7 @@
53 * good, and GC takes extra care when moving them. 53 * good, and GC takes extra care when moving them.
54 */ 54 */
55 55
56#include <linux/slab.h>
56#include <linux/pagemap.h> 57#include <linux/pagemap.h>
57#include <linux/list_sort.h> 58#include <linux/list_sort.h>
58#include "ubifs.h" 59#include "ubifs.h"
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index e589fedaf1ef..bcf5a16f30bb 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -51,6 +51,7 @@
51 */ 51 */
52 52
53#include <linux/crc32.h> 53#include <linux/crc32.h>
54#include <linux/slab.h>
54#include "ubifs.h" 55#include "ubifs.h"
55 56
56/** 57/**
@@ -63,6 +64,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
63 if (!c->ro_media) { 64 if (!c->ro_media) {
64 c->ro_media = 1; 65 c->ro_media = 1;
65 c->no_chk_data_crc = 0; 66 c->no_chk_data_crc = 0;
67 c->vfs_sb->s_flags |= MS_RDONLY;
66 ubifs_warn("switched to read-only mode, error %d", err); 68 ubifs_warn("switched to read-only mode, error %d", err);
67 dbg_dump_stack(); 69 dbg_dump_stack();
68 } 70 }
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index b2792e84d245..ad7f67b827ea 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -46,6 +46,7 @@
46#include "ubifs.h" 46#include "ubifs.h"
47#include <linux/crc16.h> 47#include <linux/crc16.h>
48#include <linux/math64.h> 48#include <linux/math64.h>
49#include <linux/slab.h>
49 50
50/** 51/**
51 * do_calc_lpt_geom - calculate sizes for the LPT area. 52 * do_calc_lpt_geom - calculate sizes for the LPT area.
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 8cbfb8248025..13cb7a4237bf 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -26,6 +26,7 @@
26 */ 26 */
27 27
28#include <linux/crc16.h> 28#include <linux/crc16.h>
29#include <linux/slab.h>
29#include "ubifs.h" 30#include "ubifs.h"
30 31
31/** 32/**
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 868a55ee080f..109c6ea03bb5 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -31,6 +31,7 @@
31 */ 31 */
32 32
33#include <linux/crc32.h> 33#include <linux/crc32.h>
34#include <linux/slab.h>
34#include "ubifs.h" 35#include "ubifs.h"
35 36
36/** 37/**
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 57085e43320f..96cb62c8a9dd 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -27,6 +27,7 @@
27 */ 27 */
28 28
29#include "ubifs.h" 29#include "ubifs.h"
30#include <linux/slab.h>
30#include <linux/random.h> 31#include <linux/random.h>
31#include <linux/math64.h> 32#include <linux/math64.h>
32 33
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index e5b1a7d00fa0..2194915220e5 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -31,6 +31,7 @@
31 */ 31 */
32 32
33#include <linux/crc32.h> 33#include <linux/crc32.h>
34#include <linux/slab.h>
34#include "ubifs.h" 35#include "ubifs.h"
35 36
36/* 37/*
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index b2d976366a46..bd2542dad014 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -28,6 +28,7 @@
28#include <linux/fs.h> 28#include <linux/fs.h>
29#include <linux/err.h> 29#include <linux/err.h>
30#include <linux/sched.h> 30#include <linux/sched.h>
31#include <linux/slab.h>
31#include <linux/vmalloc.h> 32#include <linux/vmalloc.h>
32#include <linux/spinlock.h> 33#include <linux/spinlock.h>
33#include <linux/mutex.h> 34#include <linux/mutex.h>
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 195830f47569..c74400f88fe0 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -56,6 +56,7 @@
56 */ 56 */
57 57
58#include "ubifs.h" 58#include "ubifs.h"
59#include <linux/slab.h>
59#include <linux/xattr.h> 60#include <linux/xattr.h>
60#include <linux/posix_acl_xattr.h> 61#include <linux/posix_acl_xattr.h>
61 62
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index ccc3ad7242d4..9a9378b4eb5a 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -31,55 +31,8 @@
31#define udf_clear_bit(nr, addr) ext2_clear_bit(nr, addr) 31#define udf_clear_bit(nr, addr) ext2_clear_bit(nr, addr)
32#define udf_set_bit(nr, addr) ext2_set_bit(nr, addr) 32#define udf_set_bit(nr, addr) ext2_set_bit(nr, addr)
33#define udf_test_bit(nr, addr) ext2_test_bit(nr, addr) 33#define udf_test_bit(nr, addr) ext2_test_bit(nr, addr)
34#define udf_find_first_one_bit(addr, size) find_first_one_bit(addr, size)
35#define udf_find_next_one_bit(addr, size, offset) \ 34#define udf_find_next_one_bit(addr, size, offset) \
36 find_next_one_bit(addr, size, offset) 35 ext2_find_next_bit(addr, size, offset)
37
38#define leBPL_to_cpup(x) leNUM_to_cpup(BITS_PER_LONG, x)
39#define leNUM_to_cpup(x, y) xleNUM_to_cpup(x, y)
40#define xleNUM_to_cpup(x, y) (le ## x ## _to_cpup(y))
41#define uintBPL_t uint(BITS_PER_LONG)
42#define uint(x) xuint(x)
43#define xuint(x) __le ## x
44
45static inline int find_next_one_bit(void *addr, int size, int offset)
46{
47 uintBPL_t *p = ((uintBPL_t *) addr) + (offset / BITS_PER_LONG);
48 int result = offset & ~(BITS_PER_LONG - 1);
49 unsigned long tmp;
50
51 if (offset >= size)
52 return size;
53 size -= result;
54 offset &= (BITS_PER_LONG - 1);
55 if (offset) {
56 tmp = leBPL_to_cpup(p++);
57 tmp &= ~0UL << offset;
58 if (size < BITS_PER_LONG)
59 goto found_first;
60 if (tmp)
61 goto found_middle;
62 size -= BITS_PER_LONG;
63 result += BITS_PER_LONG;
64 }
65 while (size & ~(BITS_PER_LONG - 1)) {
66 tmp = leBPL_to_cpup(p++);
67 if (tmp)
68 goto found_middle;
69 result += BITS_PER_LONG;
70 size -= BITS_PER_LONG;
71 }
72 if (!size)
73 return result;
74 tmp = leBPL_to_cpup(p);
75found_first:
76 tmp &= ~0UL >> (BITS_PER_LONG - size);
77found_middle:
78 return result + ffz(~tmp);
79}
80
81#define find_first_one_bit(addr, size)\
82 find_next_one_bit((addr), (size), 0)
83 36
84static int read_block_bitmap(struct super_block *sb, 37static int read_block_bitmap(struct super_block *sb,
85 struct udf_bitmap *bitmap, unsigned int block, 38 struct udf_bitmap *bitmap, unsigned int block,
@@ -172,9 +125,8 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
172 125
173 mutex_lock(&sbi->s_alloc_mutex); 126 mutex_lock(&sbi->s_alloc_mutex);
174 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum]; 127 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
175 if (bloc->logicalBlockNum < 0 || 128 if (bloc->logicalBlockNum + count < count ||
176 (bloc->logicalBlockNum + count) > 129 (bloc->logicalBlockNum + count) > partmap->s_partition_len) {
177 partmap->s_partition_len) {
178 udf_debug("%d < %d || %d + %d > %d\n", 130 udf_debug("%d < %d || %d + %d > %d\n",
179 bloc->logicalBlockNum, 0, bloc->logicalBlockNum, 131 bloc->logicalBlockNum, 0, bloc->logicalBlockNum,
180 count, partmap->s_partition_len); 132 count, partmap->s_partition_len);
@@ -440,9 +392,8 @@ static void udf_table_free_blocks(struct super_block *sb,
440 392
441 mutex_lock(&sbi->s_alloc_mutex); 393 mutex_lock(&sbi->s_alloc_mutex);
442 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum]; 394 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
443 if (bloc->logicalBlockNum < 0 || 395 if (bloc->logicalBlockNum + count < count ||
444 (bloc->logicalBlockNum + count) > 396 (bloc->logicalBlockNum + count) > partmap->s_partition_len) {
445 partmap->s_partition_len) {
446 udf_debug("%d < %d || %d + %d > %d\n", 397 udf_debug("%d < %d || %d + %d > %d\n",
447 bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count, 398 bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count,
448 partmap->s_partition_len); 399 partmap->s_partition_len);
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 1eb06774ed90..4b6a46ccbf46 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -218,7 +218,7 @@ const struct file_operations udf_file_operations = {
218 .llseek = generic_file_llseek, 218 .llseek = generic_file_llseek,
219}; 219};
220 220
221static int udf_setattr(struct dentry *dentry, struct iattr *iattr) 221int udf_setattr(struct dentry *dentry, struct iattr *iattr)
222{ 222{
223 struct inode *inode = dentry->d_inode; 223 struct inode *inode = dentry->d_inode;
224 int error; 224 int error;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index b57ab0402d89..8a3fbd177cab 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -106,7 +106,7 @@ void udf_clear_inode(struct inode *inode)
106 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && 106 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
107 inode->i_size != iinfo->i_lenExtents) { 107 inode->i_size != iinfo->i_lenExtents) {
108 printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has " 108 printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has "
109 "inode size %llu different from extent lenght %llu. " 109 "inode size %llu different from extent length %llu. "
110 "Filesystem need not be standards compliant.\n", 110 "Filesystem need not be standards compliant.\n",
111 inode->i_sb->s_id, inode->i_ino, inode->i_mode, 111 inode->i_sb->s_id, inode->i_ino, inode->i_mode,
112 (unsigned long long)inode->i_size, 112 (unsigned long long)inode->i_size,
@@ -1314,7 +1314,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1314 break; 1314 break;
1315 case ICBTAG_FILE_TYPE_SYMLINK: 1315 case ICBTAG_FILE_TYPE_SYMLINK:
1316 inode->i_data.a_ops = &udf_symlink_aops; 1316 inode->i_data.a_ops = &udf_symlink_aops;
1317 inode->i_op = &page_symlink_inode_operations; 1317 inode->i_op = &udf_symlink_inode_operations;
1318 inode->i_mode = S_IFLNK | S_IRWXUGO; 1318 inode->i_mode = S_IFLNK | S_IRWXUGO;
1319 break; 1319 break;
1320 case ICBTAG_FILE_TYPE_MAIN: 1320 case ICBTAG_FILE_TYPE_MAIN:
@@ -1408,20 +1408,19 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1408 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; 1408 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
1409 struct udf_inode_info *iinfo = UDF_I(inode); 1409 struct udf_inode_info *iinfo = UDF_I(inode);
1410 1410
1411 bh = udf_tread(inode->i_sb, 1411 bh = udf_tgetblk(inode->i_sb,
1412 udf_get_lb_pblock(inode->i_sb, 1412 udf_get_lb_pblock(inode->i_sb, &iinfo->i_location, 0));
1413 &iinfo->i_location, 0));
1414 if (!bh) { 1413 if (!bh) {
1415 udf_debug("bread failure\n"); 1414 udf_debug("getblk failure\n");
1416 return -EIO; 1415 return -ENOMEM;
1417 } 1416 }
1418 1417
1419 memset(bh->b_data, 0x00, inode->i_sb->s_blocksize); 1418 lock_buffer(bh);
1420 1419 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1421 fe = (struct fileEntry *)bh->b_data; 1420 fe = (struct fileEntry *)bh->b_data;
1422 efe = (struct extendedFileEntry *)bh->b_data; 1421 efe = (struct extendedFileEntry *)bh->b_data;
1423 1422
1424 if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) { 1423 if (iinfo->i_use) {
1425 struct unallocSpaceEntry *use = 1424 struct unallocSpaceEntry *use =
1426 (struct unallocSpaceEntry *)bh->b_data; 1425 (struct unallocSpaceEntry *)bh->b_data;
1427 1426
@@ -1429,20 +1428,18 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1429 memcpy(bh->b_data + sizeof(struct unallocSpaceEntry), 1428 memcpy(bh->b_data + sizeof(struct unallocSpaceEntry),
1430 iinfo->i_ext.i_data, inode->i_sb->s_blocksize - 1429 iinfo->i_ext.i_data, inode->i_sb->s_blocksize -
1431 sizeof(struct unallocSpaceEntry)); 1430 sizeof(struct unallocSpaceEntry));
1431 use->descTag.tagIdent = cpu_to_le16(TAG_IDENT_USE);
1432 use->descTag.tagLocation =
1433 cpu_to_le32(iinfo->i_location.logicalBlockNum);
1432 crclen = sizeof(struct unallocSpaceEntry) + 1434 crclen = sizeof(struct unallocSpaceEntry) +
1433 iinfo->i_lenAlloc - sizeof(struct tag); 1435 iinfo->i_lenAlloc - sizeof(struct tag);
1434 use->descTag.tagLocation = cpu_to_le32(
1435 iinfo->i_location.
1436 logicalBlockNum);
1437 use->descTag.descCRCLength = cpu_to_le16(crclen); 1436 use->descTag.descCRCLength = cpu_to_le16(crclen);
1438 use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use + 1437 use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use +
1439 sizeof(struct tag), 1438 sizeof(struct tag),
1440 crclen)); 1439 crclen));
1441 use->descTag.tagChecksum = udf_tag_checksum(&use->descTag); 1440 use->descTag.tagChecksum = udf_tag_checksum(&use->descTag);
1442 1441
1443 mark_buffer_dirty(bh); 1442 goto out;
1444 brelse(bh);
1445 return err;
1446 } 1443 }
1447 1444
1448 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET)) 1445 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET))
@@ -1597,18 +1594,21 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1597 fe->descTag.tagSerialNum = cpu_to_le16(sbi->s_serial_number); 1594 fe->descTag.tagSerialNum = cpu_to_le16(sbi->s_serial_number);
1598 fe->descTag.tagLocation = cpu_to_le32( 1595 fe->descTag.tagLocation = cpu_to_le32(
1599 iinfo->i_location.logicalBlockNum); 1596 iinfo->i_location.logicalBlockNum);
1600 crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - 1597 crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - sizeof(struct tag);
1601 sizeof(struct tag);
1602 fe->descTag.descCRCLength = cpu_to_le16(crclen); 1598 fe->descTag.descCRCLength = cpu_to_le16(crclen);
1603 fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag), 1599 fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag),
1604 crclen)); 1600 crclen));
1605 fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag); 1601 fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag);
1606 1602
1603out:
1604 set_buffer_uptodate(bh);
1605 unlock_buffer(bh);
1606
1607 /* write the data blocks */ 1607 /* write the data blocks */
1608 mark_buffer_dirty(bh); 1608 mark_buffer_dirty(bh);
1609 if (do_sync) { 1609 if (do_sync) {
1610 sync_dirty_buffer(bh); 1610 sync_dirty_buffer(bh);
1611 if (buffer_req(bh) && !buffer_uptodate(bh)) { 1611 if (buffer_write_io_error(bh)) {
1612 printk(KERN_WARNING "IO error syncing udf inode " 1612 printk(KERN_WARNING "IO error syncing udf inode "
1613 "[%s:%08lx]\n", inode->i_sb->s_id, 1613 "[%s:%08lx]\n", inode->i_sb->s_id,
1614 inode->i_ino); 1614 inode->i_ino);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index db423ab078b1..75816025f95f 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -925,7 +925,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
925 iinfo = UDF_I(inode); 925 iinfo = UDF_I(inode);
926 inode->i_mode = S_IFLNK | S_IRWXUGO; 926 inode->i_mode = S_IFLNK | S_IRWXUGO;
927 inode->i_data.a_ops = &udf_symlink_aops; 927 inode->i_data.a_ops = &udf_symlink_aops;
928 inode->i_op = &page_symlink_inode_operations; 928 inode->i_op = &udf_symlink_inode_operations;
929 929
930 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { 930 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
931 struct kernel_lb_addr eloc; 931 struct kernel_lb_addr eloc;
@@ -1393,6 +1393,7 @@ const struct export_operations udf_export_ops = {
1393const struct inode_operations udf_dir_inode_operations = { 1393const struct inode_operations udf_dir_inode_operations = {
1394 .lookup = udf_lookup, 1394 .lookup = udf_lookup,
1395 .create = udf_create, 1395 .create = udf_create,
1396 .setattr = udf_setattr,
1396 .link = udf_link, 1397 .link = udf_link,
1397 .unlink = udf_unlink, 1398 .unlink = udf_unlink,
1398 .symlink = udf_symlink, 1399 .symlink = udf_symlink,
@@ -1401,3 +1402,9 @@ const struct inode_operations udf_dir_inode_operations = {
1401 .mknod = udf_mknod, 1402 .mknod = udf_mknod,
1402 .rename = udf_rename, 1403 .rename = udf_rename,
1403}; 1404};
1405const struct inode_operations udf_symlink_inode_operations = {
1406 .readlink = generic_readlink,
1407 .follow_link = page_follow_link_light,
1408 .put_link = page_put_link,
1409 .setattr = udf_setattr,
1410};
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 4b540ee632d5..745eb209be0c 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -24,7 +24,6 @@
24 24
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/string.h> 26#include <linux/string.h>
27#include <linux/slab.h>
28#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
29 28
30uint32_t udf_get_pblock(struct super_block *sb, uint32_t block, 29uint32_t udf_get_pblock(struct super_block *sb, uint32_t block,
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 852e91845688..16064787d2b7 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -26,7 +26,6 @@
26#include <linux/time.h> 26#include <linux/time.h>
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/stat.h> 28#include <linux/stat.h>
29#include <linux/slab.h>
30#include <linux/pagemap.h> 29#include <linux/pagemap.h>
31#include <linux/smp_lock.h> 30#include <linux/smp_lock.h>
32#include <linux/buffer_head.h> 31#include <linux/buffer_head.h>
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 4223ac855da9..702a1148e702 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -76,6 +76,7 @@ extern const struct inode_operations udf_dir_inode_operations;
76extern const struct file_operations udf_dir_operations; 76extern const struct file_operations udf_dir_operations;
77extern const struct inode_operations udf_file_inode_operations; 77extern const struct inode_operations udf_file_inode_operations;
78extern const struct file_operations udf_file_operations; 78extern const struct file_operations udf_file_operations;
79extern const struct inode_operations udf_symlink_inode_operations;
79extern const struct address_space_operations udf_aops; 80extern const struct address_space_operations udf_aops;
80extern const struct address_space_operations udf_adinicb_aops; 81extern const struct address_space_operations udf_adinicb_aops;
81extern const struct address_space_operations udf_symlink_aops; 82extern const struct address_space_operations udf_symlink_aops;
@@ -131,7 +132,7 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
131/* file.c */ 132/* file.c */
132extern int udf_ioctl(struct inode *, struct file *, unsigned int, 133extern int udf_ioctl(struct inode *, struct file *, unsigned int,
133 unsigned long); 134 unsigned long);
134 135extern int udf_setattr(struct dentry *dentry, struct iattr *iattr);
135/* inode.c */ 136/* inode.c */
136extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); 137extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
137extern int udf_sync_inode(struct inode *); 138extern int udf_sync_inode(struct inode *);
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index cefa8c8913e6..d03a90b6ad69 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -24,6 +24,7 @@
24#include <linux/string.h> /* for memset */ 24#include <linux/string.h> /* for memset */
25#include <linux/nls.h> 25#include <linux/nls.h>
26#include <linux/crc-itu-t.h> 26#include <linux/crc-itu-t.h>
27#include <linux/slab.h>
27 28
28#include "udf_sb.h" 29#include "udf_sb.h"
29 30
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 66b63a751615..14743d935a93 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1016,6 +1016,9 @@ magic_found:
1016 case UFS_FSSTABLE: 1016 case UFS_FSSTABLE:
1017 UFSD("fs is stable\n"); 1017 UFSD("fs is stable\n");
1018 break; 1018 break;
1019 case UFS_FSLOG:
1020 UFSD("fs is logging fs\n");
1021 break;
1019 case UFS_FSOSF1: 1022 case UFS_FSOSF1:
1020 UFSD("fs is DEC OSF/1\n"); 1023 UFSD("fs is DEC OSF/1\n");
1021 break; 1024 break;
diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h
index 54bde1895a80..6943ec677c0b 100644
--- a/fs/ufs/ufs_fs.h
+++ b/fs/ufs/ufs_fs.h
@@ -138,6 +138,7 @@ typedef __u16 __bitwise __fs16;
138 138
139#define UFS_USEEFT ((__u16)65535) 139#define UFS_USEEFT ((__u16)65535)
140 140
141/* fs_clean values */
141#define UFS_FSOK 0x7c269d38 142#define UFS_FSOK 0x7c269d38
142#define UFS_FSACTIVE ((__s8)0x00) 143#define UFS_FSACTIVE ((__s8)0x00)
143#define UFS_FSCLEAN ((__s8)0x01) 144#define UFS_FSCLEAN ((__s8)0x01)
@@ -145,6 +146,11 @@ typedef __u16 __bitwise __fs16;
145#define UFS_FSOSF1 ((__s8)0x03) /* is this correct for DEC OSF/1? */ 146#define UFS_FSOSF1 ((__s8)0x03) /* is this correct for DEC OSF/1? */
146#define UFS_FSBAD ((__s8)0xff) 147#define UFS_FSBAD ((__s8)0xff)
147 148
149/* Solaris-specific fs_clean values */
150#define UFS_FSSUSPEND ((__s8)0xfe) /* temporarily suspended */
151#define UFS_FSLOG ((__s8)0xfd) /* logging fs */
152#define UFS_FSFIX ((__s8)0xfc) /* being repaired while mounted */
153
148/* From here to next blank line, s_flags for ufs_sb_info */ 154/* From here to next blank line, s_flags for ufs_sb_info */
149/* directory entry encoding */ 155/* directory entry encoding */
150#define UFS_DE_MASK 0x00000010 /* mask for the following */ 156#define UFS_DE_MASK 0x00000010 /* mask for the following */
@@ -227,11 +233,16 @@ typedef __u16 __bitwise __fs16;
227 */ 233 */
228#define ufs_cbtocylno(bno) \ 234#define ufs_cbtocylno(bno) \
229 ((bno) * uspi->s_nspf / uspi->s_spc) 235 ((bno) * uspi->s_nspf / uspi->s_spc)
230#define ufs_cbtorpos(bno) \ 236#define ufs_cbtorpos(bno) \
237 ((UFS_SB(sb)->s_flags & UFS_CG_SUN) ? \
238 (((((bno) * uspi->s_nspf % uspi->s_spc) % \
239 uspi->s_nsect) * \
240 uspi->s_nrpos) / uspi->s_nsect) \
241 : \
231 ((((bno) * uspi->s_nspf % uspi->s_spc / uspi->s_nsect \ 242 ((((bno) * uspi->s_nspf % uspi->s_spc / uspi->s_nsect \
232 * uspi->s_trackskew + (bno) * uspi->s_nspf % uspi->s_spc \ 243 * uspi->s_trackskew + (bno) * uspi->s_nspf % uspi->s_spc \
233 % uspi->s_nsect * uspi->s_interleave) % uspi->s_nsect \ 244 % uspi->s_nsect * uspi->s_interleave) % uspi->s_nsect \
234 * uspi->s_nrpos) / uspi->s_npsect) 245 * uspi->s_nrpos) / uspi->s_npsect))
235 246
236/* 247/*
237 * The following macros optimize certain frequently calculated 248 * The following macros optimize certain frequently calculated
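The Sun branch of ufs_cbtorpos added above, transcribed into a function for readability (the parameter types are assumptions; the macro operates on the uspi fields directly):

	static inline unsigned ufs_cbtorpos_sun(unsigned long bno,
			const struct ufs_sb_private_info *uspi)
	{
		/* sector offset of the block within its cylinder ... */
		unsigned sect = bno * uspi->s_nspf % uspi->s_spc % uspi->s_nsect;

		/* ... scaled to the number of rotational positions per track */
		return sect * uspi->s_nrpos / uspi->s_nsect;
	}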
diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c
index 05ac0fe9c4d3..8d5a506c82eb 100644
--- a/fs/xattr_acl.c
+++ b/fs/xattr_acl.c
@@ -6,9 +6,9 @@
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/slab.h>
10#include <linux/fs.h> 9#include <linux/fs.h>
11#include <linux/posix_acl_xattr.h> 10#include <linux/posix_acl_xattr.h>
11#include <linux/gfp.h>
12 12
13 13
14/* 14/*
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index bc7405585def..666c9db48eb6 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include <linux/mm.h> 18#include <linux/mm.h>
19#include <linux/highmem.h> 19#include <linux/highmem.h>
20#include <linux/slab.h>
20#include <linux/swap.h> 21#include <linux/swap.h>
21#include <linux/blkdev.h> 22#include <linux/blkdev.h>
22#include <linux/backing-dev.h> 23#include <linux/backing-dev.h>
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index bf85bbe4a9ae..a7bc925c4d60 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -22,6 +22,7 @@
22#include "xfs_inode.h" 22#include "xfs_inode.h"
23#include "xfs_vnodeops.h" 23#include "xfs_vnodeops.h"
24#include "xfs_trace.h" 24#include "xfs_trace.h"
25#include <linux/slab.h>
25#include <linux/xattr.h> 26#include <linux/xattr.h>
26#include <linux/posix_acl_xattr.h> 27#include <linux/posix_acl_xattr.h>
27 28
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 9083357f9e44..089eaca860b4 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -40,10 +40,20 @@
40#include "xfs_vnodeops.h" 40#include "xfs_vnodeops.h"
41#include "xfs_trace.h" 41#include "xfs_trace.h"
42#include "xfs_bmap.h" 42#include "xfs_bmap.h"
43#include <linux/gfp.h>
43#include <linux/mpage.h> 44#include <linux/mpage.h>
44#include <linux/pagevec.h> 45#include <linux/pagevec.h>
45#include <linux/writeback.h> 46#include <linux/writeback.h>
46 47
48/*
49 * Types of I/O for bmap clustering and I/O completion tracking.
50 */
51enum {
52 IO_READ, /* mapping for a read */
53 IO_DELAY, /* mapping covers delalloc region */
54 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
55 IO_NEW /* just allocated */
56};
47 57
48/* 58/*
49 * Prime number of hash buckets since address is used as the key. 59 * Prime number of hash buckets since address is used as the key.
@@ -102,8 +112,9 @@ xfs_count_page_state(
102 112
103STATIC struct block_device * 113STATIC struct block_device *
104xfs_find_bdev_for_inode( 114xfs_find_bdev_for_inode(
105 struct xfs_inode *ip) 115 struct inode *inode)
106{ 116{
117 struct xfs_inode *ip = XFS_I(inode);
107 struct xfs_mount *mp = ip->i_mount; 118 struct xfs_mount *mp = ip->i_mount;
108 119
109 if (XFS_IS_REALTIME_INODE(ip)) 120 if (XFS_IS_REALTIME_INODE(ip))
@@ -182,7 +193,7 @@ xfs_setfilesize(
182 xfs_fsize_t isize; 193 xfs_fsize_t isize;
183 194
184 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); 195 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
185 ASSERT(ioend->io_type != IOMAP_READ); 196 ASSERT(ioend->io_type != IO_READ);
186 197
187 if (unlikely(ioend->io_error)) 198 if (unlikely(ioend->io_error))
188 return 0; 199 return 0;
@@ -213,7 +224,7 @@ xfs_finish_ioend(
213 if (atomic_dec_and_test(&ioend->io_remaining)) { 224 if (atomic_dec_and_test(&ioend->io_remaining)) {
214 struct workqueue_struct *wq; 225 struct workqueue_struct *wq;
215 226
216 wq = (ioend->io_type == IOMAP_UNWRITTEN) ? 227 wq = (ioend->io_type == IO_UNWRITTEN) ?
217 xfsconvertd_workqueue : xfsdatad_workqueue; 228 xfsconvertd_workqueue : xfsdatad_workqueue;
218 queue_work(wq, &ioend->io_work); 229 queue_work(wq, &ioend->io_work);
219 if (wait) 230 if (wait)
@@ -236,7 +247,7 @@ xfs_end_io(
 236 * For unwritten extents we need to issue transactions to convert a 247 * For unwritten extents we need to issue transactions to convert a
 237 * range to normal written extents after the data I/O has finished. 248 * range to normal written extents after the data I/O has finished.
238 */ 249 */
239 if (ioend->io_type == IOMAP_UNWRITTEN && 250 if (ioend->io_type == IO_UNWRITTEN &&
240 likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { 251 likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
241 252
242 error = xfs_iomap_write_unwritten(ip, ioend->io_offset, 253 error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
@@ -249,7 +260,7 @@ xfs_end_io(
249 * We might have to update the on-disk file size after extending 260 * We might have to update the on-disk file size after extending
250 * writes. 261 * writes.
251 */ 262 */
252 if (ioend->io_type != IOMAP_READ) { 263 if (ioend->io_type != IO_READ) {
253 error = xfs_setfilesize(ioend); 264 error = xfs_setfilesize(ioend);
254 ASSERT(!error || error == EAGAIN); 265 ASSERT(!error || error == EAGAIN);
255 } 266 }
@@ -308,21 +319,25 @@ xfs_map_blocks(
308 struct inode *inode, 319 struct inode *inode,
309 loff_t offset, 320 loff_t offset,
310 ssize_t count, 321 ssize_t count,
311 xfs_iomap_t *mapp, 322 struct xfs_bmbt_irec *imap,
312 int flags) 323 int flags)
313{ 324{
314 int nmaps = 1; 325 int nmaps = 1;
326 int new = 0;
315 327
316 return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps); 328 return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new);
317} 329}
318 330
319STATIC int 331STATIC int
320xfs_iomap_valid( 332xfs_imap_valid(
321 xfs_iomap_t *iomapp, 333 struct inode *inode,
322 loff_t offset) 334 struct xfs_bmbt_irec *imap,
335 xfs_off_t offset)
323{ 336{
324 return offset >= iomapp->iomap_offset && 337 offset >>= inode->i_blkbits;
325 offset < iomapp->iomap_offset + iomapp->iomap_bsize; 338
339 return offset >= imap->br_startoff &&
340 offset < imap->br_startoff + imap->br_blockcount;
326} 341}
327 342
328/* 343/*
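The renamed xfs_imap_valid() now works in filesystem-block units: the byte offset is shifted down by i_blkbits and tested for containment in [br_startoff, br_startoff + br_blockcount), both of which the bmbt irec keeps in fs blocks. The check in isolation:

	static bool imap_contains(const struct xfs_bmbt_irec *imap,
				  xfs_off_t byte_offset, unsigned int blkbits)
	{
		xfs_fileoff_t fsb = byte_offset >> blkbits;

		return fsb >= imap->br_startoff &&
		       fsb <  imap->br_startoff + imap->br_blockcount;
	}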
@@ -553,19 +568,23 @@ xfs_add_to_ioend(
553 568
554STATIC void 569STATIC void
555xfs_map_buffer( 570xfs_map_buffer(
571 struct inode *inode,
556 struct buffer_head *bh, 572 struct buffer_head *bh,
557 xfs_iomap_t *mp, 573 struct xfs_bmbt_irec *imap,
558 xfs_off_t offset, 574 xfs_off_t offset)
559 uint block_bits)
560{ 575{
561 sector_t bn; 576 sector_t bn;
577 struct xfs_mount *m = XFS_I(inode)->i_mount;
578 xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
579 xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
562 580
563 ASSERT(mp->iomap_bn != IOMAP_DADDR_NULL); 581 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
582 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
564 583
565 bn = (mp->iomap_bn >> (block_bits - BBSHIFT)) + 584 bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
566 ((offset - mp->iomap_offset) >> block_bits); 585 ((offset - iomap_offset) >> inode->i_blkbits);
567 586
568 ASSERT(bn || (mp->iomap_flags & IOMAP_REALTIME)); 587 ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
569 588
570 bh->b_blocknr = bn; 589 bh->b_blocknr = bn;
571 set_buffer_mapped(bh); 590 set_buffer_mapped(bh);
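A worked example of the b_blocknr arithmetic above, assuming 4 KiB filesystem blocks (i_blkbits = 12) and 512-byte basic blocks (BBSHIFT = 9); the numbers are invented:

	/*
	 *   iomap_bn     = 80       daddr of the extent start (512 B units)
	 *   iomap_offset = 0x1000   byte offset of the extent start in the file
	 *   offset       = 0x3000   byte offset being mapped
	 *
	 *   bn = (80 >> (12 - 9)) + ((0x3000 - 0x1000) >> 12)
	 *      = 10 + 2
	 *      = 12                 bh->b_blocknr, in 4 KiB fs-block units
	 */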
@@ -573,17 +592,17 @@ xfs_map_buffer(
573 592
574STATIC void 593STATIC void
575xfs_map_at_offset( 594xfs_map_at_offset(
595 struct inode *inode,
576 struct buffer_head *bh, 596 struct buffer_head *bh,
577 loff_t offset, 597 struct xfs_bmbt_irec *imap,
578 int block_bits, 598 xfs_off_t offset)
579 xfs_iomap_t *iomapp)
580{ 599{
581 ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE)); 600 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
582 ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY)); 601 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
583 602
584 lock_buffer(bh); 603 lock_buffer(bh);
585 xfs_map_buffer(bh, iomapp, offset, block_bits); 604 xfs_map_buffer(inode, bh, imap, offset);
586 bh->b_bdev = iomapp->iomap_target->bt_bdev; 605 bh->b_bdev = xfs_find_bdev_for_inode(inode);
587 set_buffer_mapped(bh); 606 set_buffer_mapped(bh);
588 clear_buffer_delay(bh); 607 clear_buffer_delay(bh);
589 clear_buffer_unwritten(bh); 608 clear_buffer_unwritten(bh);
@@ -712,11 +731,11 @@ xfs_is_delayed_page(
712 bh = head = page_buffers(page); 731 bh = head = page_buffers(page);
713 do { 732 do {
714 if (buffer_unwritten(bh)) 733 if (buffer_unwritten(bh))
715 acceptable = (type == IOMAP_UNWRITTEN); 734 acceptable = (type == IO_UNWRITTEN);
716 else if (buffer_delay(bh)) 735 else if (buffer_delay(bh))
717 acceptable = (type == IOMAP_DELAY); 736 acceptable = (type == IO_DELAY);
718 else if (buffer_dirty(bh) && buffer_mapped(bh)) 737 else if (buffer_dirty(bh) && buffer_mapped(bh))
719 acceptable = (type == IOMAP_NEW); 738 acceptable = (type == IO_NEW);
720 else 739 else
721 break; 740 break;
722 } while ((bh = bh->b_this_page) != head); 741 } while ((bh = bh->b_this_page) != head);
@@ -739,7 +758,7 @@ xfs_convert_page(
739 struct inode *inode, 758 struct inode *inode,
740 struct page *page, 759 struct page *page,
741 loff_t tindex, 760 loff_t tindex,
742 xfs_iomap_t *mp, 761 struct xfs_bmbt_irec *imap,
743 xfs_ioend_t **ioendp, 762 xfs_ioend_t **ioendp,
744 struct writeback_control *wbc, 763 struct writeback_control *wbc,
745 int startio, 764 int startio,
@@ -749,7 +768,6 @@ xfs_convert_page(
749 xfs_off_t end_offset; 768 xfs_off_t end_offset;
750 unsigned long p_offset; 769 unsigned long p_offset;
751 unsigned int type; 770 unsigned int type;
752 int bbits = inode->i_blkbits;
753 int len, page_dirty; 771 int len, page_dirty;
754 int count = 0, done = 0, uptodate = 1; 772 int count = 0, done = 0, uptodate = 1;
755 xfs_off_t offset = page_offset(page); 773 xfs_off_t offset = page_offset(page);
@@ -801,19 +819,19 @@ xfs_convert_page(
801 819
802 if (buffer_unwritten(bh) || buffer_delay(bh)) { 820 if (buffer_unwritten(bh) || buffer_delay(bh)) {
803 if (buffer_unwritten(bh)) 821 if (buffer_unwritten(bh))
804 type = IOMAP_UNWRITTEN; 822 type = IO_UNWRITTEN;
805 else 823 else
806 type = IOMAP_DELAY; 824 type = IO_DELAY;
807 825
808 if (!xfs_iomap_valid(mp, offset)) { 826 if (!xfs_imap_valid(inode, imap, offset)) {
809 done = 1; 827 done = 1;
810 continue; 828 continue;
811 } 829 }
812 830
813 ASSERT(!(mp->iomap_flags & IOMAP_HOLE)); 831 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
814 ASSERT(!(mp->iomap_flags & IOMAP_DELAY)); 832 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
815 833
816 xfs_map_at_offset(bh, offset, bbits, mp); 834 xfs_map_at_offset(inode, bh, imap, offset);
817 if (startio) { 835 if (startio) {
818 xfs_add_to_ioend(inode, bh, offset, 836 xfs_add_to_ioend(inode, bh, offset,
819 type, ioendp, done); 837 type, ioendp, done);
@@ -825,7 +843,7 @@ xfs_convert_page(
825 page_dirty--; 843 page_dirty--;
826 count++; 844 count++;
827 } else { 845 } else {
828 type = IOMAP_NEW; 846 type = IO_NEW;
829 if (buffer_mapped(bh) && all_bh && startio) { 847 if (buffer_mapped(bh) && all_bh && startio) {
830 lock_buffer(bh); 848 lock_buffer(bh);
831 xfs_add_to_ioend(inode, bh, offset, 849 xfs_add_to_ioend(inode, bh, offset,
@@ -865,7 +883,7 @@ STATIC void
865xfs_cluster_write( 883xfs_cluster_write(
866 struct inode *inode, 884 struct inode *inode,
867 pgoff_t tindex, 885 pgoff_t tindex,
868 xfs_iomap_t *iomapp, 886 struct xfs_bmbt_irec *imap,
869 xfs_ioend_t **ioendp, 887 xfs_ioend_t **ioendp,
870 struct writeback_control *wbc, 888 struct writeback_control *wbc,
871 int startio, 889 int startio,
@@ -884,7 +902,7 @@ xfs_cluster_write(
884 902
885 for (i = 0; i < pagevec_count(&pvec); i++) { 903 for (i = 0; i < pagevec_count(&pvec); i++) {
886 done = xfs_convert_page(inode, pvec.pages[i], tindex++, 904 done = xfs_convert_page(inode, pvec.pages[i], tindex++,
887 iomapp, ioendp, wbc, startio, all_bh); 905 imap, ioendp, wbc, startio, all_bh);
888 if (done) 906 if (done)
889 break; 907 break;
890 } 908 }
@@ -929,7 +947,10 @@ xfs_aops_discard_page(
929 loff_t offset = page_offset(page); 947 loff_t offset = page_offset(page);
930 ssize_t len = 1 << inode->i_blkbits; 948 ssize_t len = 1 << inode->i_blkbits;
931 949
932 if (!xfs_is_delayed_page(page, IOMAP_DELAY)) 950 if (!xfs_is_delayed_page(page, IO_DELAY))
951 goto out_invalidate;
952
953 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
933 goto out_invalidate; 954 goto out_invalidate;
934 955
935 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 956 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
@@ -964,8 +985,10 @@ xfs_aops_discard_page(
964 985
965 if (error) { 986 if (error) {
966 /* something screwed, just bail */ 987 /* something screwed, just bail */
967 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 988 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
968 "page discard failed delalloc mapping lookup."); 989 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
990 "page discard failed delalloc mapping lookup.");
991 }
969 break; 992 break;
970 } 993 }
971 if (!nimaps) { 994 if (!nimaps) {
@@ -991,8 +1014,10 @@ xfs_aops_discard_page(
991 ASSERT(!flist.xbf_count && !flist.xbf_first); 1014 ASSERT(!flist.xbf_count && !flist.xbf_first);
992 if (error) { 1015 if (error) {
993 /* something screwed, just bail */ 1016 /* something screwed, just bail */
994 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 1017 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1018 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
995 "page discard unable to remove delalloc mapping."); 1019 "page discard unable to remove delalloc mapping.");
1020 }
996 break; 1021 break;
997 } 1022 }
998next_buffer: 1023next_buffer:
@@ -1034,15 +1059,15 @@ xfs_page_state_convert(
1034 int unmapped) /* also implies page uptodate */ 1059 int unmapped) /* also implies page uptodate */
1035{ 1060{
1036 struct buffer_head *bh, *head; 1061 struct buffer_head *bh, *head;
1037 xfs_iomap_t iomap; 1062 struct xfs_bmbt_irec imap;
1038 xfs_ioend_t *ioend = NULL, *iohead = NULL; 1063 xfs_ioend_t *ioend = NULL, *iohead = NULL;
1039 loff_t offset; 1064 loff_t offset;
1040 unsigned long p_offset = 0; 1065 unsigned long p_offset = 0;
1041 unsigned int type; 1066 unsigned int type;
1042 __uint64_t end_offset; 1067 __uint64_t end_offset;
1043 pgoff_t end_index, last_index, tlast; 1068 pgoff_t end_index, last_index;
1044 ssize_t size, len; 1069 ssize_t size, len;
1045 int flags, err, iomap_valid = 0, uptodate = 1; 1070 int flags, err, imap_valid = 0, uptodate = 1;
1046 int page_dirty, count = 0; 1071 int page_dirty, count = 0;
1047 int trylock = 0; 1072 int trylock = 0;
1048 int all_bh = unmapped; 1073 int all_bh = unmapped;
@@ -1089,7 +1114,7 @@ xfs_page_state_convert(
1089 bh = head = page_buffers(page); 1114 bh = head = page_buffers(page);
1090 offset = page_offset(page); 1115 offset = page_offset(page);
1091 flags = BMAPI_READ; 1116 flags = BMAPI_READ;
1092 type = IOMAP_NEW; 1117 type = IO_NEW;
1093 1118
1094 /* TODO: cleanup count and page_dirty */ 1119 /* TODO: cleanup count and page_dirty */
1095 1120
@@ -1103,12 +1128,12 @@ xfs_page_state_convert(
1103 * the iomap is actually still valid, but the ioend 1128 * the iomap is actually still valid, but the ioend
1104 * isn't. shouldn't happen too often. 1129 * isn't. shouldn't happen too often.
1105 */ 1130 */
1106 iomap_valid = 0; 1131 imap_valid = 0;
1107 continue; 1132 continue;
1108 } 1133 }
1109 1134
1110 if (iomap_valid) 1135 if (imap_valid)
1111 iomap_valid = xfs_iomap_valid(&iomap, offset); 1136 imap_valid = xfs_imap_valid(inode, &imap, offset);
1112 1137
1113 /* 1138 /*
1114 * First case, map an unwritten extent and prepare for 1139 * First case, map an unwritten extent and prepare for
@@ -1129,20 +1154,20 @@ xfs_page_state_convert(
1129 * Make sure we don't use a read-only iomap 1154 * Make sure we don't use a read-only iomap
1130 */ 1155 */
1131 if (flags == BMAPI_READ) 1156 if (flags == BMAPI_READ)
1132 iomap_valid = 0; 1157 imap_valid = 0;
1133 1158
1134 if (buffer_unwritten(bh)) { 1159 if (buffer_unwritten(bh)) {
1135 type = IOMAP_UNWRITTEN; 1160 type = IO_UNWRITTEN;
1136 flags = BMAPI_WRITE | BMAPI_IGNSTATE; 1161 flags = BMAPI_WRITE | BMAPI_IGNSTATE;
1137 } else if (buffer_delay(bh)) { 1162 } else if (buffer_delay(bh)) {
1138 type = IOMAP_DELAY; 1163 type = IO_DELAY;
1139 flags = BMAPI_ALLOCATE | trylock; 1164 flags = BMAPI_ALLOCATE | trylock;
1140 } else { 1165 } else {
1141 type = IOMAP_NEW; 1166 type = IO_NEW;
1142 flags = BMAPI_WRITE | BMAPI_MMAP; 1167 flags = BMAPI_WRITE | BMAPI_MMAP;
1143 } 1168 }
1144 1169
1145 if (!iomap_valid) { 1170 if (!imap_valid) {
1146 /* 1171 /*
1147 * if we didn't have a valid mapping then we 1172 * if we didn't have a valid mapping then we
1148 * need to ensure that we put the new mapping 1173 * need to ensure that we put the new mapping
@@ -1152,7 +1177,7 @@ xfs_page_state_convert(
1152 * for unwritten extent conversion. 1177 * for unwritten extent conversion.
1153 */ 1178 */
1154 new_ioend = 1; 1179 new_ioend = 1;
1155 if (type == IOMAP_NEW) { 1180 if (type == IO_NEW) {
1156 size = xfs_probe_cluster(inode, 1181 size = xfs_probe_cluster(inode,
1157 page, bh, head, 0); 1182 page, bh, head, 0);
1158 } else { 1183 } else {
@@ -1160,14 +1185,14 @@ xfs_page_state_convert(
1160 } 1185 }
1161 1186
1162 err = xfs_map_blocks(inode, offset, size, 1187 err = xfs_map_blocks(inode, offset, size,
1163 &iomap, flags); 1188 &imap, flags);
1164 if (err) 1189 if (err)
1165 goto error; 1190 goto error;
1166 iomap_valid = xfs_iomap_valid(&iomap, offset); 1191 imap_valid = xfs_imap_valid(inode, &imap,
1192 offset);
1167 } 1193 }
1168 if (iomap_valid) { 1194 if (imap_valid) {
1169 xfs_map_at_offset(bh, offset, 1195 xfs_map_at_offset(inode, bh, &imap, offset);
1170 inode->i_blkbits, &iomap);
1171 if (startio) { 1196 if (startio) {
1172 xfs_add_to_ioend(inode, bh, offset, 1197 xfs_add_to_ioend(inode, bh, offset,
1173 type, &ioend, 1198 type, &ioend,
@@ -1186,40 +1211,41 @@ xfs_page_state_convert(
1186 * That means it must already have extents allocated 1211 * That means it must already have extents allocated
1187 * underneath it. Map the extent by reading it. 1212 * underneath it. Map the extent by reading it.
1188 */ 1213 */
1189 if (!iomap_valid || flags != BMAPI_READ) { 1214 if (!imap_valid || flags != BMAPI_READ) {
1190 flags = BMAPI_READ; 1215 flags = BMAPI_READ;
1191 size = xfs_probe_cluster(inode, page, bh, 1216 size = xfs_probe_cluster(inode, page, bh,
1192 head, 1); 1217 head, 1);
1193 err = xfs_map_blocks(inode, offset, size, 1218 err = xfs_map_blocks(inode, offset, size,
1194 &iomap, flags); 1219 &imap, flags);
1195 if (err) 1220 if (err)
1196 goto error; 1221 goto error;
1197 iomap_valid = xfs_iomap_valid(&iomap, offset); 1222 imap_valid = xfs_imap_valid(inode, &imap,
1223 offset);
1198 } 1224 }
1199 1225
1200 /* 1226 /*
1201 * We set the type to IOMAP_NEW in case we are doing a 1227 * We set the type to IO_NEW in case we are doing a
1202 * small write at EOF that is extending the file but 1228 * small write at EOF that is extending the file but
1203 * without needing an allocation. We need to update the 1229 * without needing an allocation. We need to update the
1204 * file size on I/O completion in this case so it is 1230 * file size on I/O completion in this case so it is
1205 * the same case as having just allocated a new extent 1231 * the same case as having just allocated a new extent
1206 * that we are writing into for the first time. 1232 * that we are writing into for the first time.
1207 */ 1233 */
1208 type = IOMAP_NEW; 1234 type = IO_NEW;
1209 if (trylock_buffer(bh)) { 1235 if (trylock_buffer(bh)) {
1210 ASSERT(buffer_mapped(bh)); 1236 ASSERT(buffer_mapped(bh));
1211 if (iomap_valid) 1237 if (imap_valid)
1212 all_bh = 1; 1238 all_bh = 1;
1213 xfs_add_to_ioend(inode, bh, offset, type, 1239 xfs_add_to_ioend(inode, bh, offset, type,
1214 &ioend, !iomap_valid); 1240 &ioend, !imap_valid);
1215 page_dirty--; 1241 page_dirty--;
1216 count++; 1242 count++;
1217 } else { 1243 } else {
1218 iomap_valid = 0; 1244 imap_valid = 0;
1219 } 1245 }
1220 } else if ((buffer_uptodate(bh) || PageUptodate(page)) && 1246 } else if ((buffer_uptodate(bh) || PageUptodate(page)) &&
1221 (unmapped || startio)) { 1247 (unmapped || startio)) {
1222 iomap_valid = 0; 1248 imap_valid = 0;
1223 } 1249 }
1224 1250
1225 if (!iohead) 1251 if (!iohead)
@@ -1233,12 +1259,23 @@ xfs_page_state_convert(
1233 if (startio) 1259 if (startio)
1234 xfs_start_page_writeback(page, 1, count); 1260 xfs_start_page_writeback(page, 1, count);
1235 1261
1236 if (ioend && iomap_valid) { 1262 if (ioend && imap_valid) {
1237 offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >> 1263 xfs_off_t end_index;
1238 PAGE_CACHE_SHIFT; 1264
1239 tlast = min_t(pgoff_t, offset, last_index); 1265 end_index = imap.br_startoff + imap.br_blockcount;
1240 xfs_cluster_write(inode, page->index + 1, &iomap, &ioend, 1266
1241 wbc, startio, all_bh, tlast); 1267 /* to bytes */
1268 end_index <<= inode->i_blkbits;
1269
1270 /* to pages */
1271 end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
1272
1273 /* check against file size */
1274 if (end_index > last_index)
1275 end_index = last_index;
1276
1277 xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
1278 wbc, startio, all_bh, end_index);
1242 } 1279 }
1243 1280
1244 if (iohead) 1281 if (iohead)
@@ -1440,10 +1477,11 @@ __xfs_get_blocks(
1440 int direct, 1477 int direct,
1441 bmapi_flags_t flags) 1478 bmapi_flags_t flags)
1442{ 1479{
1443 xfs_iomap_t iomap; 1480 struct xfs_bmbt_irec imap;
1444 xfs_off_t offset; 1481 xfs_off_t offset;
1445 ssize_t size; 1482 ssize_t size;
1446 int niomap = 1; 1483 int nimap = 1;
1484 int new = 0;
1447 int error; 1485 int error;
1448 1486
1449 offset = (xfs_off_t)iblock << inode->i_blkbits; 1487 offset = (xfs_off_t)iblock << inode->i_blkbits;
@@ -1454,22 +1492,21 @@ __xfs_get_blocks(
1454 return 0; 1492 return 0;
1455 1493
1456 error = xfs_iomap(XFS_I(inode), offset, size, 1494 error = xfs_iomap(XFS_I(inode), offset, size,
1457 create ? flags : BMAPI_READ, &iomap, &niomap); 1495 create ? flags : BMAPI_READ, &imap, &nimap, &new);
1458 if (error) 1496 if (error)
1459 return -error; 1497 return -error;
1460 if (niomap == 0) 1498 if (nimap == 0)
1461 return 0; 1499 return 0;
1462 1500
1463 if (iomap.iomap_bn != IOMAP_DADDR_NULL) { 1501 if (imap.br_startblock != HOLESTARTBLOCK &&
1502 imap.br_startblock != DELAYSTARTBLOCK) {
1464 /* 1503 /*
1465 * For unwritten extents do not report a disk address on 1504 * For unwritten extents do not report a disk address on
1466 * the read case (treat as if we're reading into a hole). 1505 * the read case (treat as if we're reading into a hole).
1467 */ 1506 */
1468 if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) { 1507 if (create || !ISUNWRITTEN(&imap))
1469 xfs_map_buffer(bh_result, &iomap, offset, 1508 xfs_map_buffer(inode, bh_result, &imap, offset);
1470 inode->i_blkbits); 1509 if (create && ISUNWRITTEN(&imap)) {
1471 }
1472 if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) {
1473 if (direct) 1510 if (direct)
1474 bh_result->b_private = inode; 1511 bh_result->b_private = inode;
1475 set_buffer_unwritten(bh_result); 1512 set_buffer_unwritten(bh_result);
@@ -1480,7 +1517,7 @@ __xfs_get_blocks(
1480	 * If this is a realtime file, data may be on a different device 1517	 * If this is a realtime file, data may be on a different device
1481 * to that pointed to from the buffer_head b_bdev currently. 1518 * to that pointed to from the buffer_head b_bdev currently.
1482 */ 1519 */
1483 bh_result->b_bdev = iomap.iomap_target->bt_bdev; 1520 bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
1484 1521
1485 /* 1522 /*
1486 * If we previously allocated a block out beyond eof and we are now 1523 * If we previously allocated a block out beyond eof and we are now
@@ -1494,10 +1531,10 @@ __xfs_get_blocks(
1494 if (create && 1531 if (create &&
1495 ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || 1532 ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
1496 (offset >= i_size_read(inode)) || 1533 (offset >= i_size_read(inode)) ||
1497 (iomap.iomap_flags & (IOMAP_NEW|IOMAP_UNWRITTEN)))) 1534 (new || ISUNWRITTEN(&imap))))
1498 set_buffer_new(bh_result); 1535 set_buffer_new(bh_result);
1499 1536
1500 if (iomap.iomap_flags & IOMAP_DELAY) { 1537 if (imap.br_startblock == DELAYSTARTBLOCK) {
1501 BUG_ON(direct); 1538 BUG_ON(direct);
1502 if (create) { 1539 if (create) {
1503 set_buffer_uptodate(bh_result); 1540 set_buffer_uptodate(bh_result);
@@ -1506,11 +1543,23 @@ __xfs_get_blocks(
1506 } 1543 }
1507 } 1544 }
1508 1545
1546 /*
1547	 * If this is O_DIRECT or the mpage code calling, tell them how large
1548 * the mapping is, so that we can avoid repeated get_blocks calls.
1549 */
1509 if (direct || size > (1 << inode->i_blkbits)) { 1550 if (direct || size > (1 << inode->i_blkbits)) {
1510 ASSERT(iomap.iomap_bsize - iomap.iomap_delta > 0); 1551 xfs_off_t mapping_size;
1511 offset = min_t(xfs_off_t, 1552
1512 iomap.iomap_bsize - iomap.iomap_delta, size); 1553 mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
1513 bh_result->b_size = (ssize_t)min_t(xfs_off_t, LONG_MAX, offset); 1554 mapping_size <<= inode->i_blkbits;
1555
1556 ASSERT(mapping_size > 0);
1557 if (mapping_size > size)
1558 mapping_size = size;
1559 if (mapping_size > LONG_MAX)
1560 mapping_size = LONG_MAX;
1561
1562 bh_result->b_size = mapping_size;
1514 } 1563 }
1515 1564
1516 return 0; 1565 return 0;
@@ -1568,7 +1617,7 @@ xfs_end_io_direct(
1568 */ 1617 */
1569 ioend->io_offset = offset; 1618 ioend->io_offset = offset;
1570 ioend->io_size = size; 1619 ioend->io_size = size;
1571 if (ioend->io_type == IOMAP_READ) { 1620 if (ioend->io_type == IO_READ) {
1572 xfs_finish_ioend(ioend, 0); 1621 xfs_finish_ioend(ioend, 0);
1573 } else if (private && size > 0) { 1622 } else if (private && size > 0) {
1574 xfs_finish_ioend(ioend, is_sync_kiocb(iocb)); 1623 xfs_finish_ioend(ioend, is_sync_kiocb(iocb));
@@ -1579,7 +1628,7 @@ xfs_end_io_direct(
1579	 * didn't map an unwritten extent so switch its completion 1628	 * didn't map an unwritten extent so switch its completion
1580 * handler. 1629 * handler.
1581 */ 1630 */
1582 ioend->io_type = IOMAP_NEW; 1631 ioend->io_type = IO_NEW;
1583 xfs_finish_ioend(ioend, 0); 1632 xfs_finish_ioend(ioend, 0);
1584 } 1633 }
1585 1634
@@ -1604,10 +1653,10 @@ xfs_vm_direct_IO(
1604 struct block_device *bdev; 1653 struct block_device *bdev;
1605 ssize_t ret; 1654 ssize_t ret;
1606 1655
1607 bdev = xfs_find_bdev_for_inode(XFS_I(inode)); 1656 bdev = xfs_find_bdev_for_inode(inode);
1608 1657
1609 iocb->private = xfs_alloc_ioend(inode, rw == WRITE ? 1658 iocb->private = xfs_alloc_ioend(inode, rw == WRITE ?
1610 IOMAP_UNWRITTEN : IOMAP_READ); 1659 IO_UNWRITTEN : IO_READ);
1611 1660
1612 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, 1661 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
1613 offset, nr_segs, 1662 offset, nr_segs,
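
The __xfs_get_blocks() hunks above drop the old iomap_bsize/iomap_delta
arithmetic in favour of a mapping size computed directly from the bmbt
record and clamped against the request and LONG_MAX. A standalone sketch
of that clamping (userspace C, with the kernel's xfs_off_t/sector_t
simplified to long long; an illustrative approximation, not the kernel
code itself):

#include <assert.h>
#include <limits.h>
#include <stdio.h>

static long long map_bh_size(long long br_startoff, long long br_blockcount,
			     long long iblock, unsigned int blkbits,
			     long long requested_size)
{
	/* blocks of the extent remaining at and beyond the requested block */
	long long mapping_size = br_startoff + br_blockcount - iblock;

	mapping_size <<= blkbits;		/* fs blocks -> bytes */

	assert(mapping_size > 0);
	if (mapping_size > requested_size)	/* never report past the request */
		mapping_size = requested_size;
	if (mapping_size > LONG_MAX)		/* b_size assignment must not wrap */
		mapping_size = LONG_MAX;
	return mapping_size;
}

int main(void)
{
	/* 8-block extent at block 100, request at block 102 for 64k: 24576 */
	printf("%lld\n", map_bh_size(100, 8, 102, 12, 65536));
	return 0;
}

Reporting the full remaining extent size, rather than one block, is what
lets O_DIRECT and the mpage path avoid repeated get_blocks calls.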
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 6f76ba85f193..f01de3c55c43 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -18,7 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include <linux/stddef.h> 19#include <linux/stddef.h>
20#include <linux/errno.h> 20#include <linux/errno.h>
21#include <linux/slab.h> 21#include <linux/gfp.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
@@ -168,75 +168,6 @@ test_page_region(
168} 168}
169 169
170/* 170/*
171 * Mapping of multi-page buffers into contiguous virtual space
172 */
173
174typedef struct a_list {
175 void *vm_addr;
176 struct a_list *next;
177} a_list_t;
178
179static a_list_t *as_free_head;
180static int as_list_len;
181static DEFINE_SPINLOCK(as_lock);
182
183/*
184 * Try to batch vunmaps because they are costly.
185 */
186STATIC void
187free_address(
188 void *addr)
189{
190 a_list_t *aentry;
191
192#ifdef CONFIG_XEN
193 /*
194 * Xen needs to be able to make sure it can get an exclusive
195 * RO mapping of pages it wants to turn into a pagetable. If
196 * a newly allocated page is also still being vmap()ed by xfs,
197 * it will cause pagetable construction to fail. This is a
198 * quick workaround to always eagerly unmap pages so that Xen
199 * is happy.
200 */
201 vunmap(addr);
202 return;
203#endif
204
205 aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
206 if (likely(aentry)) {
207 spin_lock(&as_lock);
208 aentry->next = as_free_head;
209 aentry->vm_addr = addr;
210 as_free_head = aentry;
211 as_list_len++;
212 spin_unlock(&as_lock);
213 } else {
214 vunmap(addr);
215 }
216}
217
218STATIC void
219purge_addresses(void)
220{
221 a_list_t *aentry, *old;
222
223 if (as_free_head == NULL)
224 return;
225
226 spin_lock(&as_lock);
227 aentry = as_free_head;
228 as_free_head = NULL;
229 as_list_len = 0;
230 spin_unlock(&as_lock);
231
232 while ((old = aentry) != NULL) {
233 vunmap(aentry->vm_addr);
234 aentry = aentry->next;
235 kfree(old);
236 }
237}
238
239/*
240 * Internal xfs_buf_t object manipulation 171 * Internal xfs_buf_t object manipulation
241 */ 172 */
242 173
@@ -337,7 +268,8 @@ xfs_buf_free(
337 uint i; 268 uint i;
338 269
339 if (xfs_buf_is_vmapped(bp)) 270 if (xfs_buf_is_vmapped(bp))
340 free_address(bp->b_addr - bp->b_offset); 271 vm_unmap_ram(bp->b_addr - bp->b_offset,
272 bp->b_page_count);
341 273
342 for (i = 0; i < bp->b_page_count; i++) { 274 for (i = 0; i < bp->b_page_count; i++) {
343 struct page *page = bp->b_pages[i]; 275 struct page *page = bp->b_pages[i];
@@ -457,10 +389,8 @@ _xfs_buf_map_pages(
457 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; 389 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
458 bp->b_flags |= XBF_MAPPED; 390 bp->b_flags |= XBF_MAPPED;
459 } else if (flags & XBF_MAPPED) { 391 } else if (flags & XBF_MAPPED) {
460 if (as_list_len > 64) 392 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
461 purge_addresses(); 393 -1, PAGE_KERNEL);
462 bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
463 VM_MAP, PAGE_KERNEL);
464 if (unlikely(bp->b_addr == NULL)) 394 if (unlikely(bp->b_addr == NULL))
465 return -ENOMEM; 395 return -ENOMEM;
466 bp->b_addr += bp->b_offset; 396 bp->b_addr += bp->b_offset;
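
The two hunks above retire the hand-rolled vmap()/vunmap() batching
(free_address/purge_addresses) in favour of vm_map_ram()/vm_unmap_ram(),
which do the lazy-unmap batching inside mm/vmalloc.c. The caller's only
remaining obligation is to keep the page counts symmetric; a minimal
sketch of the pairing, assuming kernel context:

#include <linux/mm.h>
#include <linux/vmalloc.h>

static void *map_buffer_pages(struct page **pages, unsigned int count)
{
	/* node == -1: no NUMA preference; PAGE_KERNEL: normal RW mapping */
	return vm_map_ram(pages, count, -1, PAGE_KERNEL);
}

static void unmap_buffer_pages(void *addr, unsigned int count)
{
	/* count must match what was passed to vm_map_ram() */
	vm_unmap_ram(addr, count);
}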
@@ -1077,25 +1007,20 @@ xfs_bwrite(
1077 struct xfs_mount *mp, 1007 struct xfs_mount *mp,
1078 struct xfs_buf *bp) 1008 struct xfs_buf *bp)
1079{ 1009{
1080 int iowait = (bp->b_flags & XBF_ASYNC) == 0; 1010 int error;
1081 int error = 0;
1082 1011
1083 bp->b_strat = xfs_bdstrat_cb; 1012 bp->b_strat = xfs_bdstrat_cb;
1084 bp->b_mount = mp; 1013 bp->b_mount = mp;
1085 bp->b_flags |= XBF_WRITE; 1014 bp->b_flags |= XBF_WRITE;
1086 if (!iowait) 1015 bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
1087 bp->b_flags |= _XBF_RUN_QUEUES;
1088 1016
1089 xfs_buf_delwri_dequeue(bp); 1017 xfs_buf_delwri_dequeue(bp);
1090 xfs_buf_iostrategy(bp); 1018 xfs_buf_iostrategy(bp);
1091 1019
1092 if (iowait) { 1020 error = xfs_buf_iowait(bp);
1093 error = xfs_buf_iowait(bp); 1021 if (error)
1094 if (error) 1022 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1095 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1023 xfs_buf_relse(bp);
1096 xfs_buf_relse(bp);
1097 }
1098
1099 return error; 1024 return error;
1100} 1025}
1101 1026
@@ -1684,7 +1609,8 @@ xfs_mapping_buftarg(
1684 1609
1685STATIC int 1610STATIC int
1686xfs_alloc_delwrite_queue( 1611xfs_alloc_delwrite_queue(
1687 xfs_buftarg_t *btp) 1612 xfs_buftarg_t *btp,
1613 const char *fsname)
1688{ 1614{
1689 int error = 0; 1615 int error = 0;
1690 1616
@@ -1692,7 +1618,7 @@ xfs_alloc_delwrite_queue(
1692 INIT_LIST_HEAD(&btp->bt_delwrite_queue); 1618 INIT_LIST_HEAD(&btp->bt_delwrite_queue);
1693 spin_lock_init(&btp->bt_delwrite_lock); 1619 spin_lock_init(&btp->bt_delwrite_lock);
1694 btp->bt_flags = 0; 1620 btp->bt_flags = 0;
1695 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd"); 1621 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
1696 if (IS_ERR(btp->bt_task)) { 1622 if (IS_ERR(btp->bt_task)) {
1697 error = PTR_ERR(btp->bt_task); 1623 error = PTR_ERR(btp->bt_task);
1698 goto out_error; 1624 goto out_error;
@@ -1705,7 +1631,8 @@ out_error:
1705xfs_buftarg_t * 1631xfs_buftarg_t *
1706xfs_alloc_buftarg( 1632xfs_alloc_buftarg(
1707 struct block_device *bdev, 1633 struct block_device *bdev,
1708 int external) 1634 int external,
1635 const char *fsname)
1709{ 1636{
1710 xfs_buftarg_t *btp; 1637 xfs_buftarg_t *btp;
1711 1638
@@ -1717,7 +1644,7 @@ xfs_alloc_buftarg(
1717 goto error; 1644 goto error;
1718 if (xfs_mapping_buftarg(btp, bdev)) 1645 if (xfs_mapping_buftarg(btp, bdev))
1719 goto error; 1646 goto error;
1720 if (xfs_alloc_delwrite_queue(btp)) 1647 if (xfs_alloc_delwrite_queue(btp, fsname))
1721 goto error; 1648 goto error;
1722 xfs_alloc_bufhash(btp, external); 1649 xfs_alloc_bufhash(btp, external);
1723 return btp; 1650 return btp;
@@ -1955,9 +1882,6 @@ xfsbufd(
1955 xfs_buf_iostrategy(bp); 1882 xfs_buf_iostrategy(bp);
1956 count++; 1883 count++;
1957 } 1884 }
1958
1959 if (as_list_len > 0)
1960 purge_addresses();
1961 if (count) 1885 if (count)
1962 blk_run_address_space(target->bt_mapping); 1886 blk_run_address_space(target->bt_mapping);
1963 1887
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 386e7361e50e..5fbecefa5dfd 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -390,7 +390,7 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
390/* 390/*
391 * Handling of buftargs. 391 * Handling of buftargs.
392 */ 392 */
393extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); 393extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int, const char *);
394extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); 394extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
395extern void xfs_wait_buftarg(xfs_buftarg_t *); 395extern void xfs_wait_buftarg(xfs_buftarg_t *);
396extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 396extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 42dd3bcfba6b..d8fb1b5d6cb5 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -115,6 +115,8 @@ xfs_file_fsync(
115 115
116 xfs_iflags_clear(ip, XFS_ITRUNCATED); 116 xfs_iflags_clear(ip, XFS_ITRUNCATED);
117 117
118 xfs_ioend_wait(ip);
119
118 /* 120 /*
119 * We always need to make sure that the required inode state is safe on 121 * We always need to make sure that the required inode state is safe on
120 * disk. The inode might be clean but we still might need to force the 122 * disk. The inode might be clean but we still might need to force the
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 4ea1ee18aded..699b60cbab9c 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -58,6 +58,7 @@
58#include <linux/mount.h> 58#include <linux/mount.h>
59#include <linux/namei.h> 59#include <linux/namei.h>
60#include <linux/pagemap.h> 60#include <linux/pagemap.h>
61#include <linux/slab.h>
61#include <linux/exportfs.h> 62#include <linux/exportfs.h>
62 63
63/* 64/*
@@ -526,6 +527,10 @@ xfs_attrmulti_by_handle(
526 if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) 527 if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
527 return -XFS_ERROR(EFAULT); 528 return -XFS_ERROR(EFAULT);
528 529
530 /* overflow check */
531 if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t))
532 return -E2BIG;
533
529 dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq); 534 dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq);
530 if (IS_ERR(dentry)) 535 if (IS_ERR(dentry))
531 return PTR_ERR(dentry); 536 return PTR_ERR(dentry);
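
The overflow check added above guards the multiplication that later sizes
the kernel copy of opcount operations (the compat path below gets the same
guard). The pattern, sketched on a hypothetical structure:

#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/types.h>

struct demo_op { u64 a, b; };	/* hypothetical stand-in for xfs_attr_multiop_t */

static void *alloc_ops(u32 opcount)
{
	/* opcount * sizeof() below would wrap without this check */
	if (opcount >= INT_MAX / sizeof(struct demo_op))
		return ERR_PTR(-E2BIG);
	return kmalloc(opcount * sizeof(struct demo_op), GFP_KERNEL);
}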
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 0bf6d61f0528..9287135e9bfc 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -18,6 +18,7 @@
18#include <linux/compat.h> 18#include <linux/compat.h>
19#include <linux/ioctl.h> 19#include <linux/ioctl.h>
20#include <linux/mount.h> 20#include <linux/mount.h>
21#include <linux/slab.h>
21#include <asm/uaccess.h> 22#include <asm/uaccess.h>
22#include "xfs.h" 23#include "xfs.h"
23#include "xfs_fs.h" 24#include "xfs_fs.h"
@@ -419,6 +420,10 @@ xfs_compat_attrmulti_by_handle(
419 sizeof(compat_xfs_fsop_attrmulti_handlereq_t))) 420 sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
420 return -XFS_ERROR(EFAULT); 421 return -XFS_ERROR(EFAULT);
421 422
423 /* overflow check */
424 if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t))
425 return -E2BIG;
426
422 dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq); 427 dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq);
423 if (IS_ERR(dentry)) 428 if (IS_ERR(dentry))
424 return PTR_ERR(dentry); 429 return PTR_ERR(dentry);
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 61a99608731e..9c8019c78c92 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -56,6 +56,7 @@
56#include <linux/security.h> 56#include <linux/security.h>
57#include <linux/falloc.h> 57#include <linux/falloc.h>
58#include <linux/fiemap.h> 58#include <linux/fiemap.h>
59#include <linux/slab.h>
59 60
60/* 61/*
61 * Bring the timestamps in the XFS inode uptodate. 62 * Bring the timestamps in the XFS inode uptodate.
@@ -672,7 +673,10 @@ xfs_vn_fiemap(
672 bm.bmv_length = BTOBB(length); 673 bm.bmv_length = BTOBB(length);
673 674
674 /* We add one because in getbmap world count includes the header */ 675 /* We add one because in getbmap world count includes the header */
675 bm.bmv_count = fieinfo->fi_extents_max + 1; 676 bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
677 fieinfo->fi_extents_max + 1;
678 bm.bmv_count = min_t(__s32, bm.bmv_count,
679 (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
676 bm.bmv_iflags = BMV_IF_PREALLOC; 680 bm.bmv_iflags = BMV_IF_PREALLOC;
677 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) 681 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
678 bm.bmv_iflags |= BMV_IF_ATTRFORK; 682 bm.bmv_iflags |= BMV_IF_ATTRFORK;
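
The fiemap hunk above bounds bmv_count in two steps: fall back to a large
sentinel when the caller passed fi_extents_max == 0 (a count-only query),
then cap the result at sixteen pages worth of getbmapx records so the
later allocation stays sane. The arithmetic, as a userspace sketch where
DEMO_BMAPX_SIZE and DEMO_MAXEXTNUM are illustrative assumptions, not the
kernel's exact values:

#include <stdio.h>

#define DEMO_PAGE_SIZE	4096
#define DEMO_BMAPX_SIZE	48		/* illustrative sizeof(struct getbmapx) */
#define DEMO_MAXEXTNUM	0x7fffffff	/* illustrative "unbounded" sentinel */

static long bmv_count_for(unsigned long fi_extents_max)
{
	/* +1 because in getbmap world the count includes the header */
	long count = fi_extents_max ? (long)fi_extents_max + 1
				    : DEMO_MAXEXTNUM;
	long cap = DEMO_PAGE_SIZE * 16 / DEMO_BMAPX_SIZE;	/* 1365 entries */

	return count < cap ? count : cap;
}

int main(void)
{
	printf("%ld\n", bmv_count_for(0));	/* count-only query: capped at 1365 */
	printf("%ld\n", bmv_count_for(10));	/* small request: 11 */
	return 0;
}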
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 71345a370d9f..e9002513e08f 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -61,6 +61,7 @@
61 61
62#include <linux/namei.h> 62#include <linux/namei.h>
63#include <linux/init.h> 63#include <linux/init.h>
64#include <linux/slab.h>
64#include <linux/mount.h> 65#include <linux/mount.h>
65#include <linux/mempool.h> 66#include <linux/mempool.h>
66#include <linux/writeback.h> 67#include <linux/writeback.h>
@@ -788,18 +789,18 @@ xfs_open_devices(
788 * Setup xfs_mount buffer target pointers 789 * Setup xfs_mount buffer target pointers
789 */ 790 */
790 error = ENOMEM; 791 error = ENOMEM;
791 mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0); 792 mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0, mp->m_fsname);
792 if (!mp->m_ddev_targp) 793 if (!mp->m_ddev_targp)
793 goto out_close_rtdev; 794 goto out_close_rtdev;
794 795
795 if (rtdev) { 796 if (rtdev) {
796 mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1); 797 mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1, mp->m_fsname);
797 if (!mp->m_rtdev_targp) 798 if (!mp->m_rtdev_targp)
798 goto out_free_ddev_targ; 799 goto out_free_ddev_targ;
799 } 800 }
800 801
801 if (logdev && logdev != ddev) { 802 if (logdev && logdev != ddev) {
802 mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1); 803 mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1, mp->m_fsname);
803 if (!mp->m_logdev_targp) 804 if (!mp->m_logdev_targp)
804 goto out_free_rtdev_targ; 805 goto out_free_rtdev_targ;
805 } else { 806 } else {
@@ -901,7 +902,8 @@ xfsaild_start(
901 struct xfs_ail *ailp) 902 struct xfs_ail *ailp)
902{ 903{
903 ailp->xa_target = 0; 904 ailp->xa_target = 0;
904 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild"); 905 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
906 ailp->xa_mount->m_fsname);
905 if (IS_ERR(ailp->xa_task)) 907 if (IS_ERR(ailp->xa_task))
906 return -PTR_ERR(ailp->xa_task); 908 return -PTR_ERR(ailp->xa_task);
907 return 0; 909 return 0;
@@ -1091,6 +1093,7 @@ xfs_fs_write_inode(
1091 * the code will only flush the inode if it isn't already 1093 * the code will only flush the inode if it isn't already
1092 * being flushed. 1094 * being flushed.
1093 */ 1095 */
1096 xfs_ioend_wait(ip);
1094 xfs_ilock(ip, XFS_ILOCK_SHARED); 1097 xfs_ilock(ip, XFS_ILOCK_SHARED);
1095 if (ip->i_update_core) { 1098 if (ip->i_update_core) {
1096 error = xfs_log_inode(ip); 1099 error = xfs_log_inode(ip);
@@ -1208,6 +1211,7 @@ xfs_fs_put_super(
1208 1211
1209 xfs_unmountfs(mp); 1212 xfs_unmountfs(mp);
1210 xfs_freesb(mp); 1213 xfs_freesb(mp);
1214 xfs_inode_shrinker_unregister(mp);
1211 xfs_icsb_destroy_counters(mp); 1215 xfs_icsb_destroy_counters(mp);
1212 xfs_close_devices(mp); 1216 xfs_close_devices(mp);
1213 xfs_dmops_put(mp); 1217 xfs_dmops_put(mp);
@@ -1621,6 +1625,8 @@ xfs_fs_fill_super(
1621 if (error) 1625 if (error)
1622 goto fail_vnrele; 1626 goto fail_vnrele;
1623 1627
1628 xfs_inode_shrinker_register(mp);
1629
1624 kfree(mtpt); 1630 kfree(mtpt);
1625 return 0; 1631 return 0;
1626 1632
@@ -1866,6 +1872,7 @@ init_xfs_fs(void)
1866 goto out_cleanup_procfs; 1872 goto out_cleanup_procfs;
1867 1873
1868 vfs_initquota(); 1874 vfs_initquota();
1875 xfs_inode_shrinker_init();
1869 1876
1870 error = register_filesystem(&xfs_fs_type); 1877 error = register_filesystem(&xfs_fs_type);
1871 if (error) 1878 if (error)
@@ -1893,6 +1900,7 @@ exit_xfs_fs(void)
1893{ 1900{
1894 vfs_exitquota(); 1901 vfs_exitquota();
1895 unregister_filesystem(&xfs_fs_type); 1902 unregister_filesystem(&xfs_fs_type);
1903 xfs_inode_shrinker_destroy();
1896 xfs_sysctl_unregister(); 1904 xfs_sysctl_unregister();
1897 xfs_cleanup_procfs(); 1905 xfs_cleanup_procfs();
1898 xfs_buf_terminate(); 1906 xfs_buf_terminate();
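
Several hunks in this file (and the xfsbufd/xfsaild/xfssyncd ones in the
rest of the series) switch kthread_run() to a "name/%s" format plus
m_fsname, so each per-mount daemon is identifiable in ps output.
kthread_run() forwards its format string and varargs into the task name;
a minimal sketch:

#include <linux/kthread.h>

static struct task_struct *start_named_daemon(int (*fn)(void *), void *data,
					      const char *fsname)
{
	/* the resulting name is truncated to TASK_COMM_LEN by the kernel */
	return kthread_run(fn, data, "xfsbufd/%s", fsname);
}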
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 05cd85317f6f..3884e20bc14e 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -95,7 +95,8 @@ xfs_inode_ag_walk(
95 struct xfs_perag *pag, int flags), 95 struct xfs_perag *pag, int flags),
96 int flags, 96 int flags,
97 int tag, 97 int tag,
98 int exclusive) 98 int exclusive,
99 int *nr_to_scan)
99{ 100{
100 uint32_t first_index; 101 uint32_t first_index;
101 int last_error = 0; 102 int last_error = 0;
@@ -134,7 +135,7 @@ restart:
134 if (error == EFSCORRUPTED) 135 if (error == EFSCORRUPTED)
135 break; 136 break;
136 137
137 } while (1); 138 } while ((*nr_to_scan)--);
138 139
139 if (skipped) { 140 if (skipped) {
140 delay(1); 141 delay(1);
@@ -150,12 +151,15 @@ xfs_inode_ag_iterator(
150 struct xfs_perag *pag, int flags), 151 struct xfs_perag *pag, int flags),
151 int flags, 152 int flags,
152 int tag, 153 int tag,
153 int exclusive) 154 int exclusive,
155 int *nr_to_scan)
154{ 156{
155 int error = 0; 157 int error = 0;
156 int last_error = 0; 158 int last_error = 0;
157 xfs_agnumber_t ag; 159 xfs_agnumber_t ag;
160 int nr;
158 161
162 nr = nr_to_scan ? *nr_to_scan : INT_MAX;
159 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { 163 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
160 struct xfs_perag *pag; 164 struct xfs_perag *pag;
161 165
@@ -165,14 +169,18 @@ xfs_inode_ag_iterator(
165 continue; 169 continue;
166 } 170 }
167 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, 171 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
168 exclusive); 172 exclusive, &nr);
169 xfs_perag_put(pag); 173 xfs_perag_put(pag);
170 if (error) { 174 if (error) {
171 last_error = error; 175 last_error = error;
172 if (error == EFSCORRUPTED) 176 if (error == EFSCORRUPTED)
173 break; 177 break;
174 } 178 }
179 if (nr <= 0)
180 break;
175 } 181 }
182 if (nr_to_scan)
183 *nr_to_scan = nr;
176 return XFS_ERROR(last_error); 184 return XFS_ERROR(last_error);
177} 185}
178 186
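
The hunks above thread an optional scan budget through the per-AG walk: a
NULL nr_to_scan means unlimited, otherwise the remaining count is
decremented as inodes are visited and the leftover is reported back so the
caller (the shrinker added below) can stop early across mounts. The shape
of that contract, as a simplified sketch:

#include <linux/kernel.h>

static int walk_with_budget(int nr_items, int *nr_to_scan)
{
	int nr = nr_to_scan ? *nr_to_scan : INT_MAX;	/* NULL: no limit */
	int i;

	for (i = 0; i < nr_items && nr > 0; i++, nr--)
		;	/* ... visit one tagged inode ... */

	if (nr_to_scan)
		*nr_to_scan = nr;	/* hand leftover budget back */
	return 0;
}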
@@ -291,7 +299,7 @@ xfs_sync_data(
291 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); 299 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
292 300
293 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, 301 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
294 XFS_ICI_NO_TAG, 0); 302 XFS_ICI_NO_TAG, 0, NULL);
295 if (error) 303 if (error)
296 return XFS_ERROR(error); 304 return XFS_ERROR(error);
297 305
@@ -310,7 +318,7 @@ xfs_sync_attr(
310 ASSERT((flags & ~SYNC_WAIT) == 0); 318 ASSERT((flags & ~SYNC_WAIT) == 0);
311 319
312 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, 320 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
313 XFS_ICI_NO_TAG, 0); 321 XFS_ICI_NO_TAG, 0, NULL);
314} 322}
315 323
316STATIC int 324STATIC int
@@ -348,68 +356,23 @@ xfs_commit_dummy_trans(
348 356
349STATIC int 357STATIC int
350xfs_sync_fsdata( 358xfs_sync_fsdata(
351 struct xfs_mount *mp, 359 struct xfs_mount *mp)
352 int flags)
353{ 360{
354 struct xfs_buf *bp; 361 struct xfs_buf *bp;
355 struct xfs_buf_log_item *bip;
356 int error = 0;
357
358 /*
359 * If this is xfssyncd() then only sync the superblock if we can
360 * lock it without sleeping and it is not pinned.
361 */
362 if (flags & SYNC_TRYLOCK) {
363 ASSERT(!(flags & SYNC_WAIT));
364
365 bp = xfs_getsb(mp, XBF_TRYLOCK);
366 if (!bp)
367 goto out;
368
369 bip = XFS_BUF_FSPRIVATE(bp, struct xfs_buf_log_item *);
370 if (!bip || !xfs_buf_item_dirty(bip) || XFS_BUF_ISPINNED(bp))
371 goto out_brelse;
372 } else {
373 bp = xfs_getsb(mp, 0);
374
375 /*
376 * If the buffer is pinned then push on the log so we won't
377 * get stuck waiting in the write for someone, maybe
378 * ourselves, to flush the log.
379 *
380 * Even though we just pushed the log above, we did not have
381 * the superblock buffer locked at that point so it can
382 * become pinned in between there and here.
383 */
384 if (XFS_BUF_ISPINNED(bp))
385 xfs_log_force(mp, 0);
386 }
387
388
389 if (flags & SYNC_WAIT)
390 XFS_BUF_UNASYNC(bp);
391 else
392 XFS_BUF_ASYNC(bp);
393
394 error = xfs_bwrite(mp, bp);
395 if (error)
396 return error;
397 362
398 /* 363 /*
399 * If this is a data integrity sync make sure all pending buffers 364 * If the buffer is pinned then push on the log so we won't get stuck
400 * are flushed out for the log coverage check below. 365 * waiting in the write for someone, maybe ourselves, to flush the log.
366 *
367 * Even though we just pushed the log above, we did not have the
368 * superblock buffer locked at that point so it can become pinned in
369 * between there and here.
401 */ 370 */
402 if (flags & SYNC_WAIT) 371 bp = xfs_getsb(mp, 0);
403 xfs_flush_buftarg(mp->m_ddev_targp, 1); 372 if (XFS_BUF_ISPINNED(bp))
404 373 xfs_log_force(mp, 0);
405 if (xfs_log_need_covered(mp))
406 error = xfs_commit_dummy_trans(mp, flags);
407 return error;
408 374
409 out_brelse: 375 return xfs_bwrite(mp, bp);
410 xfs_buf_relse(bp);
411 out:
412 return error;
413} 376}
414 377
415/* 378/*
@@ -433,7 +396,7 @@ int
433xfs_quiesce_data( 396xfs_quiesce_data(
434 struct xfs_mount *mp) 397 struct xfs_mount *mp)
435{ 398{
436 int error; 399 int error, error2 = 0;
437 400
438 /* push non-blocking */ 401 /* push non-blocking */
439 xfs_sync_data(mp, 0); 402 xfs_sync_data(mp, 0);
@@ -444,13 +407,20 @@ xfs_quiesce_data(
444 xfs_qm_sync(mp, SYNC_WAIT); 407 xfs_qm_sync(mp, SYNC_WAIT);
445 408
446 /* write superblock and hoover up shutdown errors */ 409 /* write superblock and hoover up shutdown errors */
447 error = xfs_sync_fsdata(mp, SYNC_WAIT); 410 error = xfs_sync_fsdata(mp);
411
412 /* make sure all delwri buffers are written out */
413 xfs_flush_buftarg(mp->m_ddev_targp, 1);
414
415 /* mark the log as covered if needed */
416 if (xfs_log_need_covered(mp))
417 error2 = xfs_commit_dummy_trans(mp, SYNC_WAIT);
448 418
449 /* flush data-only devices */ 419 /* flush data-only devices */
450 if (mp->m_rtdev_targp) 420 if (mp->m_rtdev_targp)
451 XFS_bflush(mp->m_rtdev_targp); 421 XFS_bflush(mp->m_rtdev_targp);
452 422
453 return error; 423 return error ? error : error2;
454} 424}
455 425
456STATIC void 426STATIC void
@@ -573,9 +543,9 @@ xfs_flush_inodes(
573} 543}
574 544
575/* 545/*
576 * Every sync period we need to unpin all items, reclaim inodes, sync 546 * Every sync period we need to unpin all items, reclaim inodes and sync
577 * quota and write out the superblock. We might need to cover the log 547 * disk quotas. We might need to cover the log to indicate that the
578 * to indicate it is idle. 548 * filesystem is idle.
579 */ 549 */
580STATIC void 550STATIC void
581xfs_sync_worker( 551xfs_sync_worker(
@@ -589,7 +559,8 @@ xfs_sync_worker(
589 xfs_reclaim_inodes(mp, 0); 559 xfs_reclaim_inodes(mp, 0);
590 /* dgc: errors ignored here */ 560 /* dgc: errors ignored here */
591 error = xfs_qm_sync(mp, SYNC_TRYLOCK); 561 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
592 error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); 562 if (xfs_log_need_covered(mp))
563 error = xfs_commit_dummy_trans(mp, 0);
593 } 564 }
594 mp->m_sync_seq++; 565 mp->m_sync_seq++;
595 wake_up(&mp->m_wait_single_sync_task); 566 wake_up(&mp->m_wait_single_sync_task);
@@ -652,7 +623,7 @@ xfs_syncd_init(
652 mp->m_sync_work.w_syncer = xfs_sync_worker; 623 mp->m_sync_work.w_syncer = xfs_sync_worker;
653 mp->m_sync_work.w_mount = mp; 624 mp->m_sync_work.w_mount = mp;
654 mp->m_sync_work.w_completion = NULL; 625 mp->m_sync_work.w_completion = NULL;
655 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd"); 626 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname);
656 if (IS_ERR(mp->m_sync_task)) 627 if (IS_ERR(mp->m_sync_task))
657 return -PTR_ERR(mp->m_sync_task); 628 return -PTR_ERR(mp->m_sync_task);
658 return 0; 629 return 0;
@@ -673,6 +644,7 @@ __xfs_inode_set_reclaim_tag(
673 radix_tree_tag_set(&pag->pag_ici_root, 644 radix_tree_tag_set(&pag->pag_ici_root,
674 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), 645 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
675 XFS_ICI_RECLAIM_TAG); 646 XFS_ICI_RECLAIM_TAG);
647 pag->pag_ici_reclaimable++;
676} 648}
677 649
678/* 650/*
@@ -705,6 +677,7 @@ __xfs_inode_clear_reclaim_tag(
705{ 677{
706 radix_tree_tag_clear(&pag->pag_ici_root, 678 radix_tree_tag_clear(&pag->pag_ici_root,
707 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); 679 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
680 pag->pag_ici_reclaimable--;
708} 681}
709 682
710/* 683/*
@@ -820,10 +793,10 @@ xfs_reclaim_inode(
820 * call into reclaim to find it in a clean state instead of waiting for 793 * call into reclaim to find it in a clean state instead of waiting for
821 * it now. We also don't return errors here - if the error is transient 794 * it now. We also don't return errors here - if the error is transient
822 * then the next reclaim pass will flush the inode, and if the error 795 * then the next reclaim pass will flush the inode, and if the error
823 * is permanent then the next sync reclaim will relcaim the inode and 796 * is permanent then the next sync reclaim will reclaim the inode and
824 * pass on the error. 797 * pass on the error.
825 */ 798 */
826 if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { 799 if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
827 xfs_fs_cmn_err(CE_WARN, ip->i_mount, 800 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
828 "inode 0x%llx background reclaim flush failed with %d", 801 "inode 0x%llx background reclaim flush failed with %d",
829 (long long)ip->i_ino, error); 802 (long long)ip->i_ino, error);
@@ -854,5 +827,93 @@ xfs_reclaim_inodes(
854 int mode) 827 int mode)
855{ 828{
856 return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode, 829 return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
857 XFS_ICI_RECLAIM_TAG, 1); 830 XFS_ICI_RECLAIM_TAG, 1, NULL);
831}
832
833/*
834 * Shrinker infrastructure.
835 *
836 * This is all far more complex than it needs to be. It adds a global list of
837 * mounts because the shrinker callbacks carry no per-mount context. We need to make
838 * the shrinkers pass a context to avoid the need for global state.
839 */
840static LIST_HEAD(xfs_mount_list);
841static struct rw_semaphore xfs_mount_list_lock;
842
843static int
844xfs_reclaim_inode_shrink(
845 int nr_to_scan,
846 gfp_t gfp_mask)
847{
848 struct xfs_mount *mp;
849 struct xfs_perag *pag;
850 xfs_agnumber_t ag;
851 int reclaimable = 0;
852
853 if (nr_to_scan) {
854 if (!(gfp_mask & __GFP_FS))
855 return -1;
856
857 down_read(&xfs_mount_list_lock);
858 list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
859 xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
860 XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);
861 if (nr_to_scan <= 0)
862 break;
863 }
864 up_read(&xfs_mount_list_lock);
865 }
866
867 down_read(&xfs_mount_list_lock);
868 list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
869 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
870
871 pag = xfs_perag_get(mp, ag);
872 if (!pag->pag_ici_init) {
873 xfs_perag_put(pag);
874 continue;
875 }
876 reclaimable += pag->pag_ici_reclaimable;
877 xfs_perag_put(pag);
878 }
879 }
880 up_read(&xfs_mount_list_lock);
881 return reclaimable;
882}
883
884static struct shrinker xfs_inode_shrinker = {
885 .shrink = xfs_reclaim_inode_shrink,
886 .seeks = DEFAULT_SEEKS,
887};
888
889void __init
890xfs_inode_shrinker_init(void)
891{
892 init_rwsem(&xfs_mount_list_lock);
893 register_shrinker(&xfs_inode_shrinker);
894}
895
896void
897xfs_inode_shrinker_destroy(void)
898{
899 ASSERT(list_empty(&xfs_mount_list));
900 unregister_shrinker(&xfs_inode_shrinker);
901}
902
903void
904xfs_inode_shrinker_register(
905 struct xfs_mount *mp)
906{
907 down_write(&xfs_mount_list_lock);
908 list_add_tail(&mp->m_mplist, &xfs_mount_list);
909 up_write(&xfs_mount_list_lock);
910}
911
912void
913xfs_inode_shrinker_unregister(
914 struct xfs_mount *mp)
915{
916 down_write(&xfs_mount_list_lock);
917 list_del(&mp->m_mplist);
918 up_write(&xfs_mount_list_lock);
858} 919}
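
The new shrinker above follows the mm contract of this era: called with
nr_to_scan == 0 it only reports the pool size (here the summed
pag_ici_reclaimable counts), with a nonzero count it reclaims, and it must
return -1 for allocations that cannot recurse into the filesystem. A
minimal sketch of that contract:

#include <linux/mm.h>

static int demo_shrink(int nr_to_scan, gfp_t gfp_mask)
{
	if (nr_to_scan) {
		if (!(gfp_mask & __GFP_FS))
			return -1;	/* may not re-enter the fs */
		/* ... reclaim up to nr_to_scan objects ... */
	}
	/* always report how many objects remain reclaimable */
	return 0;
}

static struct shrinker demo_shrinker = {
	.shrink	= demo_shrink,
	.seeks	= DEFAULT_SEEKS,
};

/* register_shrinker(&demo_shrinker) at init,
   unregister_shrinker(&demo_shrinker) at teardown */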
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index d480c346cabb..cdcbaaca9880 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -53,6 +53,11 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
53int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); 53int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
54int xfs_inode_ag_iterator(struct xfs_mount *mp, 54int xfs_inode_ag_iterator(struct xfs_mount *mp,
55 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), 55 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
56 int flags, int tag, int write_lock); 56 int flags, int tag, int write_lock, int *nr_to_scan);
57
58void xfs_inode_shrinker_init(void);
59void xfs_inode_shrinker_destroy(void);
60void xfs_inode_shrinker_register(struct xfs_mount *mp);
61void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
57 62
58#endif 63#endif
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
index 5a107601e969..207fa77f63ae 100644
--- a/fs/xfs/linux-2.6/xfs_trace.c
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -41,7 +41,6 @@
41#include "xfs_alloc.h" 41#include "xfs_alloc.h"
42#include "xfs_bmap.h" 42#include "xfs_bmap.h"
43#include "xfs_attr.h" 43#include "xfs_attr.h"
44#include "xfs_attr_sf.h"
45#include "xfs_attr_leaf.h" 44#include "xfs_attr_leaf.h"
46#include "xfs_log_priv.h" 45#include "xfs_log_priv.h"
47#include "xfs_buf_item.h" 46#include "xfs_buf_item.h"
@@ -50,6 +49,9 @@
50#include "xfs_aops.h" 49#include "xfs_aops.h"
51#include "quota/xfs_dquot_item.h" 50#include "quota/xfs_dquot_item.h"
52#include "quota/xfs_dquot.h" 51#include "quota/xfs_dquot.h"
52#include "xfs_log_recover.h"
53#include "xfs_buf_item.h"
54#include "xfs_inode_item.h"
53 55
54/* 56/*
55 * We include this last to have the helpers above available for the trace 57 * We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index fcaa62f0799e..8a319cfd2901 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -32,6 +32,10 @@ struct xfs_da_node_entry;
32struct xfs_dquot; 32struct xfs_dquot;
33struct xlog_ticket; 33struct xlog_ticket;
34struct log; 34struct log;
35struct xlog_recover;
36struct xlog_recover_item;
37struct xfs_buf_log_format;
38struct xfs_inode_log_format;
35 39
36DECLARE_EVENT_CLASS(xfs_attr_list_class, 40DECLARE_EVENT_CLASS(xfs_attr_list_class,
37 TP_PROTO(struct xfs_attr_list_context *ctx), 41 TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -562,18 +566,21 @@ DECLARE_EVENT_CLASS(xfs_inode_class,
562 __field(dev_t, dev) 566 __field(dev_t, dev)
563 __field(xfs_ino_t, ino) 567 __field(xfs_ino_t, ino)
564 __field(int, count) 568 __field(int, count)
569 __field(int, pincount)
565 __field(unsigned long, caller_ip) 570 __field(unsigned long, caller_ip)
566 ), 571 ),
567 TP_fast_assign( 572 TP_fast_assign(
568 __entry->dev = VFS_I(ip)->i_sb->s_dev; 573 __entry->dev = VFS_I(ip)->i_sb->s_dev;
569 __entry->ino = ip->i_ino; 574 __entry->ino = ip->i_ino;
570 __entry->count = atomic_read(&VFS_I(ip)->i_count); 575 __entry->count = atomic_read(&VFS_I(ip)->i_count);
576 __entry->pincount = atomic_read(&ip->i_pincount);
571 __entry->caller_ip = caller_ip; 577 __entry->caller_ip = caller_ip;
572 ), 578 ),
573 TP_printk("dev %d:%d ino 0x%llx count %d caller %pf", 579 TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pf",
574 MAJOR(__entry->dev), MINOR(__entry->dev), 580 MAJOR(__entry->dev), MINOR(__entry->dev),
575 __entry->ino, 581 __entry->ino,
576 __entry->count, 582 __entry->count,
583 __entry->pincount,
577 (char *)__entry->caller_ip) 584 (char *)__entry->caller_ip)
578) 585)
579 586
@@ -583,6 +590,10 @@ DEFINE_EVENT(xfs_inode_class, name, \
583 TP_ARGS(ip, caller_ip)) 590 TP_ARGS(ip, caller_ip))
584DEFINE_INODE_EVENT(xfs_ihold); 591DEFINE_INODE_EVENT(xfs_ihold);
585DEFINE_INODE_EVENT(xfs_irele); 592DEFINE_INODE_EVENT(xfs_irele);
593DEFINE_INODE_EVENT(xfs_inode_pin);
594DEFINE_INODE_EVENT(xfs_inode_unpin);
595DEFINE_INODE_EVENT(xfs_inode_unpin_nowait);
596
586/* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */ 597/* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */
587DEFINE_INODE_EVENT(xfs_inode); 598DEFINE_INODE_EVENT(xfs_inode);
588#define xfs_itrace_entry(ip) \ 599#define xfs_itrace_entry(ip) \
@@ -642,8 +653,6 @@ DEFINE_EVENT(xfs_dquot_class, name, \
642 TP_PROTO(struct xfs_dquot *dqp), \ 653 TP_PROTO(struct xfs_dquot *dqp), \
643 TP_ARGS(dqp)) 654 TP_ARGS(dqp))
644DEFINE_DQUOT_EVENT(xfs_dqadjust); 655DEFINE_DQUOT_EVENT(xfs_dqadjust);
645DEFINE_DQUOT_EVENT(xfs_dqshake_dirty);
646DEFINE_DQUOT_EVENT(xfs_dqshake_unlink);
647DEFINE_DQUOT_EVENT(xfs_dqreclaim_want); 656DEFINE_DQUOT_EVENT(xfs_dqreclaim_want);
648DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty); 657DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty);
649DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink); 658DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink);
@@ -658,7 +667,6 @@ DEFINE_DQUOT_EVENT(xfs_dqread_fail);
658DEFINE_DQUOT_EVENT(xfs_dqlookup_found); 667DEFINE_DQUOT_EVENT(xfs_dqlookup_found);
659DEFINE_DQUOT_EVENT(xfs_dqlookup_want); 668DEFINE_DQUOT_EVENT(xfs_dqlookup_want);
660DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist); 669DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist);
661DEFINE_DQUOT_EVENT(xfs_dqlookup_move);
662DEFINE_DQUOT_EVENT(xfs_dqlookup_done); 670DEFINE_DQUOT_EVENT(xfs_dqlookup_done);
663DEFINE_DQUOT_EVENT(xfs_dqget_hit); 671DEFINE_DQUOT_EVENT(xfs_dqget_hit);
664DEFINE_DQUOT_EVENT(xfs_dqget_miss); 672DEFINE_DQUOT_EVENT(xfs_dqget_miss);
@@ -1495,6 +1503,140 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \
1495DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); 1503DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
1496DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); 1504DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
1497 1505
1506DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
1507 TP_PROTO(struct log *log, struct xlog_recover *trans,
1508 struct xlog_recover_item *item, int pass),
1509 TP_ARGS(log, trans, item, pass),
1510 TP_STRUCT__entry(
1511 __field(dev_t, dev)
1512 __field(unsigned long, item)
1513 __field(xlog_tid_t, tid)
1514 __field(int, type)
1515 __field(int, pass)
1516 __field(int, count)
1517 __field(int, total)
1518 ),
1519 TP_fast_assign(
1520 __entry->dev = log->l_mp->m_super->s_dev;
1521 __entry->item = (unsigned long)item;
1522 __entry->tid = trans->r_log_tid;
1523 __entry->type = ITEM_TYPE(item);
1524 __entry->pass = pass;
1525 __entry->count = item->ri_cnt;
1526 __entry->total = item->ri_total;
1527 ),
1528 TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s "
1529 "item region count/total %d/%d",
1530 MAJOR(__entry->dev), MINOR(__entry->dev),
1531 __entry->tid,
1532 __entry->pass,
1533 (void *)__entry->item,
1534 __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
1535 __entry->count,
1536 __entry->total)
1537)
1538
1539#define DEFINE_LOG_RECOVER_ITEM(name) \
1540DEFINE_EVENT(xfs_log_recover_item_class, name, \
1541 TP_PROTO(struct log *log, struct xlog_recover *trans, \
1542 struct xlog_recover_item *item, int pass), \
1543 TP_ARGS(log, trans, item, pass))
1544
1545DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add);
1546DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add_cont);
1547DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_head);
1548DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_tail);
1549DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover);
1550
1551DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class,
1552 TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f),
1553 TP_ARGS(log, buf_f),
1554 TP_STRUCT__entry(
1555 __field(dev_t, dev)
1556 __field(__int64_t, blkno)
1557 __field(unsigned short, len)
1558 __field(unsigned short, flags)
1559 __field(unsigned short, size)
1560 __field(unsigned int, map_size)
1561 ),
1562 TP_fast_assign(
1563 __entry->dev = log->l_mp->m_super->s_dev;
1564 __entry->blkno = buf_f->blf_blkno;
1565 __entry->len = buf_f->blf_len;
1566 __entry->flags = buf_f->blf_flags;
1567 __entry->size = buf_f->blf_size;
1568 __entry->map_size = buf_f->blf_map_size;
1569 ),
1570 TP_printk("dev %d:%d blkno 0x%llx, len %u, flags 0x%x, size %d, "
1571 "map_size %d",
1572 MAJOR(__entry->dev), MINOR(__entry->dev),
1573 __entry->blkno,
1574 __entry->len,
1575 __entry->flags,
1576 __entry->size,
1577 __entry->map_size)
1578)
1579
1580#define DEFINE_LOG_RECOVER_BUF_ITEM(name) \
1581DEFINE_EVENT(xfs_log_recover_buf_item_class, name, \
1582 TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), \
1583 TP_ARGS(log, buf_f))
1584
1585DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_not_cancel);
1586DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel);
1587DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add);
1588DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc);
1589DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover);
1590DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf);
1591DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf);
1592DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf);
1593
1594DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class,
1595 TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f),
1596 TP_ARGS(log, in_f),
1597 TP_STRUCT__entry(
1598 __field(dev_t, dev)
1599 __field(xfs_ino_t, ino)
1600 __field(unsigned short, size)
1601 __field(int, fields)
1602 __field(unsigned short, asize)
1603 __field(unsigned short, dsize)
1604 __field(__int64_t, blkno)
1605 __field(int, len)
1606 __field(int, boffset)
1607 ),
1608 TP_fast_assign(
1609 __entry->dev = log->l_mp->m_super->s_dev;
1610 __entry->ino = in_f->ilf_ino;
1611 __entry->size = in_f->ilf_size;
1612 __entry->fields = in_f->ilf_fields;
1613 __entry->asize = in_f->ilf_asize;
1614 __entry->dsize = in_f->ilf_dsize;
1615 __entry->blkno = in_f->ilf_blkno;
1616 __entry->len = in_f->ilf_len;
1617 __entry->boffset = in_f->ilf_boffset;
1618 ),
1619 TP_printk("dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, "
1620 "dsize %d, blkno 0x%llx, len %d, boffset %d",
1621 MAJOR(__entry->dev), MINOR(__entry->dev),
1622 __entry->ino,
1623 __entry->size,
1624 __entry->fields,
1625 __entry->asize,
1626 __entry->dsize,
1627 __entry->blkno,
1628 __entry->len,
1629 __entry->boffset)
1630)
1631#define DEFINE_LOG_RECOVER_INO_ITEM(name) \
1632DEFINE_EVENT(xfs_log_recover_ino_item_class, name, \
1633 TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), \
1634 TP_ARGS(log, in_f))
1635
1636DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
1637DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
1638DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
1639
1498#endif /* _TRACE_XFS_H */ 1640#endif /* _TRACE_XFS_H */
1499 1641
1500#undef TRACE_INCLUDE_PATH 1642#undef TRACE_INCLUDE_PATH
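
The three event classes added above all follow the same pattern: one
DECLARE_EVENT_CLASS fixes the record layout, assignment and format string,
then each DEFINE_EVENT stamps out a named tracepoint sharing that layout
at near-zero cost. The skeleton, reduced to a hypothetical one-field event
(the TRACE_SYSTEM and include-guard scaffolding of a real trace header is
elided):

DECLARE_EVENT_CLASS(demo_class,
	TP_PROTO(int value),
	TP_ARGS(value),
	TP_STRUCT__entry(
		__field(int, value)
	),
	TP_fast_assign(
		__entry->value = value;
	),
	TP_printk("value %d", __entry->value)
)

#define DEFINE_DEMO_EVENT(name) \
DEFINE_EVENT(demo_class, name, TP_PROTO(int value), TP_ARGS(value))

DEFINE_DEMO_EVENT(demo_event_start);
DEFINE_DEMO_EVENT(demo_event_done);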
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 5f79dd78626b..b89ec5df0129 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -101,7 +101,7 @@ xfs_qm_dqinit(
101 * No need to re-initialize these if this is a reclaimed dquot. 101 * No need to re-initialize these if this is a reclaimed dquot.
102 */ 102 */
103 if (brandnewdquot) { 103 if (brandnewdquot) {
104 dqp->dq_flnext = dqp->dq_flprev = dqp; 104 INIT_LIST_HEAD(&dqp->q_freelist);
105 mutex_init(&dqp->q_qlock); 105 mutex_init(&dqp->q_qlock);
106 init_waitqueue_head(&dqp->q_pinwait); 106 init_waitqueue_head(&dqp->q_pinwait);
107 107
@@ -119,20 +119,20 @@ xfs_qm_dqinit(
119 * Only the q_core portion was zeroed in dqreclaim_one(). 119 * Only the q_core portion was zeroed in dqreclaim_one().
120 * So, we need to reset others. 120 * So, we need to reset others.
121 */ 121 */
122 dqp->q_nrefs = 0; 122 dqp->q_nrefs = 0;
123 dqp->q_blkno = 0; 123 dqp->q_blkno = 0;
124 dqp->MPL_NEXT = dqp->HL_NEXT = NULL; 124 INIT_LIST_HEAD(&dqp->q_mplist);
125 dqp->HL_PREVP = dqp->MPL_PREVP = NULL; 125 INIT_LIST_HEAD(&dqp->q_hashlist);
126 dqp->q_bufoffset = 0; 126 dqp->q_bufoffset = 0;
127 dqp->q_fileoffset = 0; 127 dqp->q_fileoffset = 0;
128 dqp->q_transp = NULL; 128 dqp->q_transp = NULL;
129 dqp->q_gdquot = NULL; 129 dqp->q_gdquot = NULL;
130 dqp->q_res_bcount = 0; 130 dqp->q_res_bcount = 0;
131 dqp->q_res_icount = 0; 131 dqp->q_res_icount = 0;
132 dqp->q_res_rtbcount = 0; 132 dqp->q_res_rtbcount = 0;
133 atomic_set(&dqp->q_pincount, 0); 133 atomic_set(&dqp->q_pincount, 0);
134 dqp->q_hash = NULL; 134 dqp->q_hash = NULL;
135 ASSERT(dqp->dq_flnext == dqp->dq_flprev); 135 ASSERT(list_empty(&dqp->q_freelist));
136 136
137 trace_xfs_dqreuse(dqp); 137 trace_xfs_dqreuse(dqp);
138 } 138 }
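
This file's hunks replace the hand-rolled freelist/hashlist/mplist pointer
pairs (dq_flnext/dq_flprev, HL_NEXT/HL_PREVP, MPL_NEXT/MPL_PREVP) with
standard list_head members, so membership tests and removals become stock
list.h idioms. Sketched on a hypothetical struct:

#include <linux/list.h>

struct demo_dquot {
	struct list_head q_freelist;
	struct list_head q_hashlist;
};

static void demo(struct demo_dquot *dqp, struct list_head *hash_chain)
{
	INIT_LIST_HEAD(&dqp->q_freelist);	/* was dq_flnext = dq_flprev = dqp */

	/* XFS_DQ_IS_ON_FREELIST() becomes an emptiness test ... */
	if (!list_empty(&dqp->q_freelist))
		list_del_init(&dqp->q_freelist);	/* ... XQM_FREELIST_REMOVE a del */

	/* "move to front of hashchain" is a single primitive */
	list_move(&dqp->q_hashlist, hash_chain);
}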
@@ -158,7 +158,7 @@ void
158xfs_qm_dqdestroy( 158xfs_qm_dqdestroy(
159 xfs_dquot_t *dqp) 159 xfs_dquot_t *dqp)
160{ 160{
161 ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp)); 161 ASSERT(list_empty(&dqp->q_freelist));
162 162
163 mutex_destroy(&dqp->q_qlock); 163 mutex_destroy(&dqp->q_qlock);
164 sv_destroy(&dqp->q_pinwait); 164 sv_destroy(&dqp->q_pinwait);
@@ -252,7 +252,7 @@ xfs_qm_adjust_dqtimers(
252 (be64_to_cpu(d->d_bcount) >= 252 (be64_to_cpu(d->d_bcount) >=
253 be64_to_cpu(d->d_blk_hardlimit)))) { 253 be64_to_cpu(d->d_blk_hardlimit)))) {
254 d->d_btimer = cpu_to_be32(get_seconds() + 254 d->d_btimer = cpu_to_be32(get_seconds() +
255 XFS_QI_BTIMELIMIT(mp)); 255 mp->m_quotainfo->qi_btimelimit);
256 } else { 256 } else {
257 d->d_bwarns = 0; 257 d->d_bwarns = 0;
258 } 258 }
@@ -275,7 +275,7 @@ xfs_qm_adjust_dqtimers(
275 (be64_to_cpu(d->d_icount) >= 275 (be64_to_cpu(d->d_icount) >=
276 be64_to_cpu(d->d_ino_hardlimit)))) { 276 be64_to_cpu(d->d_ino_hardlimit)))) {
277 d->d_itimer = cpu_to_be32(get_seconds() + 277 d->d_itimer = cpu_to_be32(get_seconds() +
278 XFS_QI_ITIMELIMIT(mp)); 278 mp->m_quotainfo->qi_itimelimit);
279 } else { 279 } else {
280 d->d_iwarns = 0; 280 d->d_iwarns = 0;
281 } 281 }
@@ -298,7 +298,7 @@ xfs_qm_adjust_dqtimers(
298 (be64_to_cpu(d->d_rtbcount) >= 298 (be64_to_cpu(d->d_rtbcount) >=
299 be64_to_cpu(d->d_rtb_hardlimit)))) { 299 be64_to_cpu(d->d_rtb_hardlimit)))) {
300 d->d_rtbtimer = cpu_to_be32(get_seconds() + 300 d->d_rtbtimer = cpu_to_be32(get_seconds() +
301 XFS_QI_RTBTIMELIMIT(mp)); 301 mp->m_quotainfo->qi_rtbtimelimit);
302 } else { 302 } else {
303 d->d_rtbwarns = 0; 303 d->d_rtbwarns = 0;
304 } 304 }
@@ -325,6 +325,7 @@ xfs_qm_init_dquot_blk(
325 uint type, 325 uint type,
326 xfs_buf_t *bp) 326 xfs_buf_t *bp)
327{ 327{
328 struct xfs_quotainfo *q = mp->m_quotainfo;
328 xfs_dqblk_t *d; 329 xfs_dqblk_t *d;
329 int curid, i; 330 int curid, i;
330 331
@@ -337,16 +338,16 @@ xfs_qm_init_dquot_blk(
337 /* 338 /*
338 * ID of the first dquot in the block - id's are zero based. 339 * ID of the first dquot in the block - id's are zero based.
339 */ 340 */
340 curid = id - (id % XFS_QM_DQPERBLK(mp)); 341 curid = id - (id % q->qi_dqperchunk);
341 ASSERT(curid >= 0); 342 ASSERT(curid >= 0);
342 memset(d, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp))); 343 memset(d, 0, BBTOB(q->qi_dqchunklen));
343 for (i = 0; i < XFS_QM_DQPERBLK(mp); i++, d++, curid++) 344 for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++)
344 xfs_qm_dqinit_core(curid, type, d); 345 xfs_qm_dqinit_core(curid, type, d);
345 xfs_trans_dquot_buf(tp, bp, 346 xfs_trans_dquot_buf(tp, bp,
346 (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF : 347 (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF :
347 ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF : 348 ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF :
348 XFS_BLI_GDQUOT_BUF))); 349 XFS_BLI_GDQUOT_BUF)));
349 xfs_trans_log_buf(tp, bp, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1); 350 xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
350} 351}
351 352
352 353
@@ -419,7 +420,7 @@ xfs_qm_dqalloc(
419 /* now we can just get the buffer (there's nothing to read yet) */ 420 /* now we can just get the buffer (there's nothing to read yet) */
420 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, 421 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
421 dqp->q_blkno, 422 dqp->q_blkno,
422 XFS_QI_DQCHUNKLEN(mp), 423 mp->m_quotainfo->qi_dqchunklen,
423 0); 424 0);
424 if (!bp || (error = XFS_BUF_GETERROR(bp))) 425 if (!bp || (error = XFS_BUF_GETERROR(bp)))
425 goto error1; 426 goto error1;
@@ -500,7 +501,8 @@ xfs_qm_dqtobp(
500 */ 501 */
501 if (dqp->q_blkno == (xfs_daddr_t) 0) { 502 if (dqp->q_blkno == (xfs_daddr_t) 0) {
502 /* We use the id as an index */ 503 /* We use the id as an index */
503 dqp->q_fileoffset = (xfs_fileoff_t)id / XFS_QM_DQPERBLK(mp); 504 dqp->q_fileoffset = (xfs_fileoff_t)id /
505 mp->m_quotainfo->qi_dqperchunk;
504 nmaps = 1; 506 nmaps = 1;
505 quotip = XFS_DQ_TO_QIP(dqp); 507 quotip = XFS_DQ_TO_QIP(dqp);
506 xfs_ilock(quotip, XFS_ILOCK_SHARED); 508 xfs_ilock(quotip, XFS_ILOCK_SHARED);
@@ -529,7 +531,7 @@ xfs_qm_dqtobp(
529 /* 531 /*
530 * offset of dquot in the (fixed sized) dquot chunk. 532 * offset of dquot in the (fixed sized) dquot chunk.
531 */ 533 */
532 dqp->q_bufoffset = (id % XFS_QM_DQPERBLK(mp)) * 534 dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
533 sizeof(xfs_dqblk_t); 535 sizeof(xfs_dqblk_t);
534 if (map.br_startblock == HOLESTARTBLOCK) { 536 if (map.br_startblock == HOLESTARTBLOCK) {
535 /* 537 /*
@@ -559,15 +561,13 @@ xfs_qm_dqtobp(
559 * Read in the buffer, unless we've just done the allocation 561 * Read in the buffer, unless we've just done the allocation
560 * (in which case we already have the buf). 562 * (in which case we already have the buf).
561 */ 563 */
562 if (! newdquot) { 564 if (!newdquot) {
563 trace_xfs_dqtobp_read(dqp); 565 trace_xfs_dqtobp_read(dqp);
564 566
565 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 567 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
566 dqp->q_blkno, 568 dqp->q_blkno,
567 XFS_QI_DQCHUNKLEN(mp), 569 mp->m_quotainfo->qi_dqchunklen,
568 0, &bp))) { 570 0, &bp);
569 return (error);
570 }
571 if (error || !bp) 571 if (error || !bp)
572 return XFS_ERROR(error); 572 return XFS_ERROR(error);
573 } 573 }
@@ -689,14 +689,14 @@ xfs_qm_idtodq(
689 tp = NULL; 689 tp = NULL;
690 if (flags & XFS_QMOPT_DQALLOC) { 690 if (flags & XFS_QMOPT_DQALLOC) {
691 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); 691 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
692 if ((error = xfs_trans_reserve(tp, 692 error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp),
693 XFS_QM_DQALLOC_SPACE_RES(mp), 693 XFS_WRITE_LOG_RES(mp) +
694 XFS_WRITE_LOG_RES(mp) + 694 BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 +
695 BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1 + 695 128,
696 128, 696 0,
697 0, 697 XFS_TRANS_PERM_LOG_RES,
698 XFS_TRANS_PERM_LOG_RES, 698 XFS_WRITE_LOG_COUNT);
699 XFS_WRITE_LOG_COUNT))) { 699 if (error) {
700 cancelflags = 0; 700 cancelflags = 0;
701 goto error0; 701 goto error0;
702 } 702 }
@@ -751,7 +751,6 @@ xfs_qm_dqlookup(
751{ 751{
752 xfs_dquot_t *dqp; 752 xfs_dquot_t *dqp;
753 uint flist_locked; 753 uint flist_locked;
754 xfs_dquot_t *d;
755 754
756 ASSERT(mutex_is_locked(&qh->qh_lock)); 755 ASSERT(mutex_is_locked(&qh->qh_lock));
757 756
@@ -760,7 +759,7 @@ xfs_qm_dqlookup(
760 /* 759 /*
761 * Traverse the hashchain looking for a match 760 * Traverse the hashchain looking for a match
762 */ 761 */
763 for (dqp = qh->qh_next; dqp != NULL; dqp = dqp->HL_NEXT) { 762 list_for_each_entry(dqp, &qh->qh_list, q_hashlist) {
764 /* 763 /*
765 * We already have the hashlock. We don't need the 764 * We already have the hashlock. We don't need the
766 * dqlock to look at the id field of the dquot, since the 765 * dqlock to look at the id field of the dquot, since the
@@ -772,12 +771,12 @@ xfs_qm_dqlookup(
772 /* 771 /*
773 * All in core dquots must be on the dqlist of mp 772 * All in core dquots must be on the dqlist of mp
774 */ 773 */
775 ASSERT(dqp->MPL_PREVP != NULL); 774 ASSERT(!list_empty(&dqp->q_mplist));
776 775
777 xfs_dqlock(dqp); 776 xfs_dqlock(dqp);
778 if (dqp->q_nrefs == 0) { 777 if (dqp->q_nrefs == 0) {
779 ASSERT (XFS_DQ_IS_ON_FREELIST(dqp)); 778 ASSERT(!list_empty(&dqp->q_freelist));
780 if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) { 779 if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
781 trace_xfs_dqlookup_want(dqp); 780 trace_xfs_dqlookup_want(dqp);
782 781
783 /* 782 /*
@@ -787,7 +786,7 @@ xfs_qm_dqlookup(
787 */ 786 */
788 dqp->dq_flags |= XFS_DQ_WANT; 787 dqp->dq_flags |= XFS_DQ_WANT;
789 xfs_dqunlock(dqp); 788 xfs_dqunlock(dqp);
790 xfs_qm_freelist_lock(xfs_Gqm); 789 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
791 xfs_dqlock(dqp); 790 xfs_dqlock(dqp);
792 dqp->dq_flags &= ~(XFS_DQ_WANT); 791 dqp->dq_flags &= ~(XFS_DQ_WANT);
793 } 792 }
@@ -802,46 +801,28 @@ xfs_qm_dqlookup(
802 801
803 if (flist_locked) { 802 if (flist_locked) {
804 if (dqp->q_nrefs != 0) { 803 if (dqp->q_nrefs != 0) {
805 xfs_qm_freelist_unlock(xfs_Gqm); 804 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
806 flist_locked = B_FALSE; 805 flist_locked = B_FALSE;
807 } else { 806 } else {
808 /* 807 /* take it off the freelist */
809 * take it off the freelist
810 */
811 trace_xfs_dqlookup_freelist(dqp); 808 trace_xfs_dqlookup_freelist(dqp);
812 XQM_FREELIST_REMOVE(dqp); 809 list_del_init(&dqp->q_freelist);
813 /* xfs_qm_freelist_print(&(xfs_Gqm-> 810 xfs_Gqm->qm_dqfrlist_cnt--;
814 qm_dqfreelist),
815 "after removal"); */
816 } 811 }
817 } 812 }
818 813
819 /*
820 * grab a reference
821 */
822 XFS_DQHOLD(dqp); 814 XFS_DQHOLD(dqp);
823 815
824 if (flist_locked) 816 if (flist_locked)
825 xfs_qm_freelist_unlock(xfs_Gqm); 817 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
826 /* 818 /*
827 * move the dquot to the front of the hashchain 819 * move the dquot to the front of the hashchain
828 */ 820 */
829 ASSERT(mutex_is_locked(&qh->qh_lock)); 821 ASSERT(mutex_is_locked(&qh->qh_lock));
830 if (dqp->HL_PREVP != &qh->qh_next) { 822 list_move(&dqp->q_hashlist, &qh->qh_list);
831 trace_xfs_dqlookup_move(dqp);
832 if ((d = dqp->HL_NEXT))
833 d->HL_PREVP = dqp->HL_PREVP;
834 *(dqp->HL_PREVP) = d;
835 d = qh->qh_next;
836 d->HL_PREVP = &dqp->HL_NEXT;
837 dqp->HL_NEXT = d;
838 dqp->HL_PREVP = &qh->qh_next;
839 qh->qh_next = dqp;
840 }
841 trace_xfs_dqlookup_done(dqp); 823 trace_xfs_dqlookup_done(dqp);
842 *O_dqpp = dqp; 824 *O_dqpp = dqp;
843 ASSERT(mutex_is_locked(&qh->qh_lock)); 825 return 0;
844 return (0);
845 } 826 }
846 } 827 }
847 828
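
The dqlookup hunk replaces roughly nine lines of hand-rolled HL_NEXT/HL_PREVP pointer surgery with a single list_move(), which unlinks the entry and re-adds it at the head of the chain: a cheap move-to-front heuristic for hot hash chains. A self-contained userspace re-creation of the pattern (the tiny list implementation below mimics the kernel's include/linux/list.h but is not it):

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add(struct list_head *n, struct list_head *h)
{
        n->next = h->next; n->prev = h;
        h->next->prev = n; h->next = n;
}

static void list_del(struct list_head *e)
{
        e->prev->next = e->next; e->next->prev = e->prev;
}

/* list_move(): unlink from wherever it is, re-add at the head */
static void list_move(struct list_head *e, struct list_head *h)
{
        list_del(e); list_add(e, h);
}

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct dquot { int id; struct list_head q_hashlist; };

int main(void)
{
        struct list_head chain;
        struct dquot a = { 1, { 0, 0 } }, b = { 2, { 0, 0 } }, c = { 3, { 0, 0 } };
        struct list_head *p;

        INIT_LIST_HEAD(&chain);
        list_add(&c.q_hashlist, &chain);
        list_add(&b.q_hashlist, &chain);
        list_add(&a.q_hashlist, &chain);        /* chain: a b c */

        list_move(&c.q_hashlist, &chain);       /* hit on c: chain is now c a b */

        for (p = chain.next; p != &chain; p = p->next)
                printf("%d ", container_of(p, struct dquot, q_hashlist)->id);
        printf("\n");                           /* prints: 3 1 2 */
        return 0;
}
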
@@ -975,16 +956,17 @@ xfs_qm_dqget(
975 */ 956 */
976 if (ip) { 957 if (ip) {
977 xfs_ilock(ip, XFS_ILOCK_EXCL); 958 xfs_ilock(ip, XFS_ILOCK_EXCL);
978 if (! XFS_IS_DQTYPE_ON(mp, type)) { 959
979 /* inode stays locked on return */
980 xfs_qm_dqdestroy(dqp);
981 return XFS_ERROR(ESRCH);
982 }
983 /* 960 /*
984 * A dquot could be attached to this inode by now, since 961 * A dquot could be attached to this inode by now, since
985 * we had dropped the ilock. 962 * we had dropped the ilock.
986 */ 963 */
987 if (type == XFS_DQ_USER) { 964 if (type == XFS_DQ_USER) {
965 if (!XFS_IS_UQUOTA_ON(mp)) {
966 /* inode stays locked on return */
967 xfs_qm_dqdestroy(dqp);
968 return XFS_ERROR(ESRCH);
969 }
988 if (ip->i_udquot) { 970 if (ip->i_udquot) {
989 xfs_qm_dqdestroy(dqp); 971 xfs_qm_dqdestroy(dqp);
990 dqp = ip->i_udquot; 972 dqp = ip->i_udquot;
@@ -992,6 +974,11 @@ xfs_qm_dqget(
992 goto dqret; 974 goto dqret;
993 } 975 }
994 } else { 976 } else {
977 if (!XFS_IS_OQUOTA_ON(mp)) {
978 /* inode stays locked on return */
979 xfs_qm_dqdestroy(dqp);
980 return XFS_ERROR(ESRCH);
981 }
995 if (ip->i_gdquot) { 982 if (ip->i_gdquot) {
996 xfs_qm_dqdestroy(dqp); 983 xfs_qm_dqdestroy(dqp);
997 dqp = ip->i_gdquot; 984 dqp = ip->i_gdquot;
@@ -1033,13 +1020,14 @@ xfs_qm_dqget(
1033 */ 1020 */
1034 ASSERT(mutex_is_locked(&h->qh_lock)); 1021 ASSERT(mutex_is_locked(&h->qh_lock));
1035 dqp->q_hash = h; 1022 dqp->q_hash = h;
1036 XQM_HASHLIST_INSERT(h, dqp); 1023 list_add(&dqp->q_hashlist, &h->qh_list);
1024 h->qh_version++;
1037 1025
1038 /* 1026 /*
1039 * Attach this dquot to this filesystem's list of all dquots, 1027 * Attach this dquot to this filesystem's list of all dquots,
1040 * kept inside the mount structure in m_quotainfo field 1028 * kept inside the mount structure in m_quotainfo field
1041 */ 1029 */
1042 xfs_qm_mplist_lock(mp); 1030 mutex_lock(&mp->m_quotainfo->qi_dqlist_lock);
1043 1031
1044 /* 1032 /*
1045 * We return a locked dquot to the caller, with a reference taken 1033 * We return a locked dquot to the caller, with a reference taken
@@ -1047,9 +1035,9 @@ xfs_qm_dqget(
1047 xfs_dqlock(dqp); 1035 xfs_dqlock(dqp);
1048 dqp->q_nrefs = 1; 1036 dqp->q_nrefs = 1;
1049 1037
1050 XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp); 1038 list_add(&dqp->q_mplist, &mp->m_quotainfo->qi_dqlist);
1051 1039 mp->m_quotainfo->qi_dquots++;
1052 xfs_qm_mplist_unlock(mp); 1040 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
1053 mutex_unlock(&h->qh_lock); 1041 mutex_unlock(&h->qh_lock);
1054 dqret: 1042 dqret:
1055 ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); 1043 ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -1086,10 +1074,10 @@ xfs_qm_dqput(
1086 * drop the dqlock and acquire the freelist and dqlock 1074 * drop the dqlock and acquire the freelist and dqlock
1087 * in the right order; but try to get it out-of-order first 1075 * in the right order; but try to get it out-of-order first
1088 */ 1076 */
1089 if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) { 1077 if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
1090 trace_xfs_dqput_wait(dqp); 1078 trace_xfs_dqput_wait(dqp);
1091 xfs_dqunlock(dqp); 1079 xfs_dqunlock(dqp);
1092 xfs_qm_freelist_lock(xfs_Gqm); 1080 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
1093 xfs_dqlock(dqp); 1081 xfs_dqlock(dqp);
1094 } 1082 }
1095 1083
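
The dqput hunk keeps the pre-existing trylock dance while swapping the wrappers for a bare mutex: the lock order is freelist lock before dquot lock, so with the dquot lock already held the code first tries the freelist lock out of order, and only on failure drops the dquot lock, takes both in order, and revalidates. A sketch of the same pattern with POSIX mutexes (build with -lpthread; names are illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t freelist_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t dq_lock = PTHREAD_MUTEX_INITIALIZER;

/* caller holds dq_lock; returns with both locks held, deadlock-free */
static void lock_freelist_with_dq_held(void)
{
        if (pthread_mutex_trylock(&freelist_lock) != 0) {
                /* out-of-order attempt failed: back off, retry in order */
                pthread_mutex_unlock(&dq_lock);
                pthread_mutex_lock(&freelist_lock);
                pthread_mutex_lock(&dq_lock);
                /*
                 * dq_lock was dropped for a moment; the caller must
                 * re-check any state it derived before the retry.
                 */
        }
}

int main(void)
{
        pthread_mutex_lock(&dq_lock);
        lock_freelist_with_dq_held();
        printf("holding both locks in lock-order-safe fashion\n");
        pthread_mutex_unlock(&dq_lock);
        pthread_mutex_unlock(&freelist_lock);
        return 0;
}
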
@@ -1100,10 +1088,8 @@ xfs_qm_dqput(
1100 if (--dqp->q_nrefs == 0) { 1088 if (--dqp->q_nrefs == 0) {
1101 trace_xfs_dqput_free(dqp); 1089 trace_xfs_dqput_free(dqp);
1102 1090
1103 /* 1091 list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist);
1104 * insert at end of the freelist. 1092 xfs_Gqm->qm_dqfrlist_cnt++;
1105 */
1106 XQM_FREELIST_INSERT(&(xfs_Gqm->qm_dqfreelist), dqp);
1107 1093
1108 /* 1094 /*
1109 * If we just added a udquot to the freelist, then 1095 * If we just added a udquot to the freelist, then
@@ -1118,10 +1104,6 @@ xfs_qm_dqput(
1118 xfs_dqlock(gdqp); 1104 xfs_dqlock(gdqp);
1119 dqp->q_gdquot = NULL; 1105 dqp->q_gdquot = NULL;
1120 } 1106 }
1121
1122 /* xfs_qm_freelist_print(&(xfs_Gqm->qm_dqfreelist),
1123 "@@@@@++ Free list (after append) @@@@@+");
1124 */
1125 } 1107 }
1126 xfs_dqunlock(dqp); 1108 xfs_dqunlock(dqp);
1127 1109
@@ -1133,7 +1115,7 @@ xfs_qm_dqput(
1133 break; 1115 break;
1134 dqp = gdqp; 1116 dqp = gdqp;
1135 } 1117 }
1136 xfs_qm_freelist_unlock(xfs_Gqm); 1118 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1137} 1119}
1138 1120
1139/* 1121/*
@@ -1386,10 +1368,10 @@ int
1386xfs_qm_dqpurge( 1368xfs_qm_dqpurge(
1387 xfs_dquot_t *dqp) 1369 xfs_dquot_t *dqp)
1388{ 1370{
1389 xfs_dqhash_t *thishash; 1371 xfs_dqhash_t *qh = dqp->q_hash;
1390 xfs_mount_t *mp = dqp->q_mount; 1372 xfs_mount_t *mp = dqp->q_mount;
1391 1373
1392 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); 1374 ASSERT(mutex_is_locked(&mp->m_quotainfo->qi_dqlist_lock));
1393 ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock)); 1375 ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock));
1394 1376
1395 xfs_dqlock(dqp); 1377 xfs_dqlock(dqp);
@@ -1407,7 +1389,7 @@ xfs_qm_dqpurge(
1407 return (1); 1389 return (1);
1408 } 1390 }
1409 1391
1410 ASSERT(XFS_DQ_IS_ON_FREELIST(dqp)); 1392 ASSERT(!list_empty(&dqp->q_freelist));
1411 1393
1412 /* 1394 /*
1413 * If we're turning off quotas, we have to make sure that, for 1395 * If we're turning off quotas, we have to make sure that, for
@@ -1452,14 +1434,16 @@ xfs_qm_dqpurge(
1452 ASSERT(XFS_FORCED_SHUTDOWN(mp) || 1434 ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
1453 !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); 1435 !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
1454 1436
1455 thishash = dqp->q_hash; 1437 list_del_init(&dqp->q_hashlist);
1456 XQM_HASHLIST_REMOVE(thishash, dqp); 1438 qh->qh_version++;
1457 XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(mp)), dqp); 1439 list_del_init(&dqp->q_mplist);
1440 mp->m_quotainfo->qi_dqreclaims++;
1441 mp->m_quotainfo->qi_dquots--;
1458 /* 1442 /*
1459 * XXX Move this to the front of the freelist, if we can get the 1443 * XXX Move this to the front of the freelist, if we can get the
1460 * freelist lock. 1444 * freelist lock.
1461 */ 1445 */
1462 ASSERT(XFS_DQ_IS_ON_FREELIST(dqp)); 1446 ASSERT(!list_empty(&dqp->q_freelist));
1463 1447
1464 dqp->q_mount = NULL; 1448 dqp->q_mount = NULL;
1465 dqp->q_hash = NULL; 1449 dqp->q_hash = NULL;
@@ -1467,7 +1451,7 @@ xfs_qm_dqpurge(
1467 memset(&dqp->q_core, 0, sizeof(dqp->q_core)); 1451 memset(&dqp->q_core, 0, sizeof(dqp->q_core));
1468 xfs_dqfunlock(dqp); 1452 xfs_dqfunlock(dqp);
1469 xfs_dqunlock(dqp); 1453 xfs_dqunlock(dqp);
1470 mutex_unlock(&thishash->qh_lock); 1454 mutex_unlock(&qh->qh_lock);
1471 return (0); 1455 return (0);
1472} 1456}
1473 1457
@@ -1517,6 +1501,7 @@ void
1517xfs_qm_dqflock_pushbuf_wait( 1501xfs_qm_dqflock_pushbuf_wait(
1518 xfs_dquot_t *dqp) 1502 xfs_dquot_t *dqp)
1519{ 1503{
1504 xfs_mount_t *mp = dqp->q_mount;
1520 xfs_buf_t *bp; 1505 xfs_buf_t *bp;
1521 1506
1522 /* 1507 /*
@@ -1525,14 +1510,14 @@ xfs_qm_dqflock_pushbuf_wait(
1525 * out immediately. We'll be able to acquire 1510 * out immediately. We'll be able to acquire
1526 * the flush lock when the I/O completes. 1511 * the flush lock when the I/O completes.
1527 */ 1512 */
1528 bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno, 1513 bp = xfs_incore(mp->m_ddev_targp, dqp->q_blkno,
1529 XFS_QI_DQCHUNKLEN(dqp->q_mount), XBF_TRYLOCK); 1514 mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
1530 if (!bp) 1515 if (!bp)
1531 goto out_lock; 1516 goto out_lock;
1532 1517
1533 if (XFS_BUF_ISDELAYWRITE(bp)) { 1518 if (XFS_BUF_ISDELAYWRITE(bp)) {
1534 if (XFS_BUF_ISPINNED(bp)) 1519 if (XFS_BUF_ISPINNED(bp))
1535 xfs_log_force(dqp->q_mount, 0); 1520 xfs_log_force(mp, 0);
1536 xfs_buf_delwri_promote(bp); 1521 xfs_buf_delwri_promote(bp);
1537 wake_up_process(bp->b_target->bt_task); 1522 wake_up_process(bp->b_target->bt_task);
1538 } 1523 }
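
Throughout the file, accessor macros such as XFS_QI_DQCHUNKLEN(mp) give way to fetching mp->m_quotainfo once and using plain field access, which shortens the lines and makes the pointer chase explicit. A small sketch of the two styles (the struct layouts here are simplified stand-ins):

#include <stdio.h>

struct quotainfo { unsigned int qi_dqchunklen, qi_dqperchunk; };
struct mount     { struct quotainfo *m_quotainfo; };

/* old style: every use hides a pointer chase behind a macro */
#define XFS_QI_DQCHUNKLEN(mp) ((mp)->m_quotainfo->qi_dqchunklen)

static void report(struct mount *mp)
{
        /* new style: dereference once, then use plain field access */
        struct quotainfo *q = mp->m_quotainfo;

        printf("chunklen=%u perchunk=%u\n",
               q->qi_dqchunklen, q->qi_dqperchunk);
}

int main(void)
{
        struct quotainfo qi = { 4096, 30 };
        struct mount mp = { &qi };

        printf("via macro: %u\n", XFS_QI_DQCHUNKLEN(&mp));
        report(&mp);
        return 0;
}
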
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index a0f7da586d1b..5da3a23b820d 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -33,40 +33,23 @@
33 * The hash chain headers (hash buckets) 33 * The hash chain headers (hash buckets)
34 */ 34 */
35typedef struct xfs_dqhash { 35typedef struct xfs_dqhash {
36 struct xfs_dquot *qh_next; 36 struct list_head qh_list;
37 struct mutex qh_lock; 37 struct mutex qh_lock;
38 uint qh_version; /* ever increasing version */ 38 uint qh_version; /* ever increasing version */
39 uint qh_nelems; /* number of dquots on the list */ 39 uint qh_nelems; /* number of dquots on the list */
40} xfs_dqhash_t; 40} xfs_dqhash_t;
41 41
42typedef struct xfs_dqlink {
43 struct xfs_dquot *ql_next; /* forward link */
44 struct xfs_dquot **ql_prevp; /* pointer to prev ql_next */
45} xfs_dqlink_t;
46
47struct xfs_mount; 42struct xfs_mount;
48struct xfs_trans; 43struct xfs_trans;
49 44
50/* 45/*
51 * This is the marker which is designed to occupy the first few
52 * bytes of the xfs_dquot_t structure. Even inside this, the freelist pointers
53 * must come first.
54 * This serves as the marker ("sentinel") when we have to restart list
55 * iterations because of locking considerations.
56 */
57typedef struct xfs_dqmarker {
58 struct xfs_dquot*dqm_flnext; /* link to freelist: must be first */
59 struct xfs_dquot*dqm_flprev;
60 xfs_dqlink_t dqm_mplist; /* link to mount's list of dquots */
61 xfs_dqlink_t dqm_hashlist; /* link to the hash chain */
62 uint dqm_flags; /* various flags (XFS_DQ_*) */
63} xfs_dqmarker_t;
64
65/*
66 * The incore dquot structure 46 * The incore dquot structure
67 */ 47 */
68typedef struct xfs_dquot { 48typedef struct xfs_dquot {
69 xfs_dqmarker_t q_lists; /* list ptrs, q_flags (marker) */ 49 uint dq_flags; /* various flags (XFS_DQ_*) */
50 struct list_head q_freelist; /* global free list of dquots */
51 struct list_head q_mplist; /* mount's list of dquots */
 52 struct list_head q_hashlist; /* global hash list of dquots */
70 xfs_dqhash_t *q_hash; /* the hashchain header */ 53 xfs_dqhash_t *q_hash; /* the hashchain header */
71 struct xfs_mount*q_mount; /* filesystem this relates to */ 54 struct xfs_mount*q_mount; /* filesystem this relates to */
72 struct xfs_trans*q_transp; /* trans this belongs to currently */ 55 struct xfs_trans*q_transp; /* trans this belongs to currently */
@@ -87,13 +70,6 @@ typedef struct xfs_dquot {
87 wait_queue_head_t q_pinwait; /* dquot pinning wait queue */ 70 wait_queue_head_t q_pinwait; /* dquot pinning wait queue */
88} xfs_dquot_t; 71} xfs_dquot_t;
89 72
90
91#define dq_flnext q_lists.dqm_flnext
92#define dq_flprev q_lists.dqm_flprev
93#define dq_mplist q_lists.dqm_mplist
94#define dq_hashlist q_lists.dqm_hashlist
95#define dq_flags q_lists.dqm_flags
96
97/* 73/*
98 * Lock hierarchy for q_qlock: 74 * Lock hierarchy for q_qlock:
99 * XFS_QLOCK_NORMAL is the implicit default, 75 * XFS_QLOCK_NORMAL is the implicit default,
@@ -127,7 +103,6 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
127} 103}
128 104
129#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) 105#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock)))
130#define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp))
131#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) 106#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY)
132#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) 107#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
133#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) 108#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ)
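
The header change is the heart of the series: the xfs_dqmarker sentinel and xfs_dqlink_t pairs are replaced by three independently embedded struct list_head members, so one dquot can sit on the freelist, the per-mount list, and a hash chain at the same time, and list_empty() replaces the old HL_PREVP/MPL_PREVP NULL checks and the XFS_DQ_IS_ON_FREELIST macro. A compact userspace illustration (the list helpers mirror the kernel's but are re-implemented here):

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };
static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }
static void list_add(struct list_head *n, struct list_head *h)
{
        n->next = h->next; n->prev = h;
        h->next->prev = n; h->next = n;
}
static int list_empty(const struct list_head *h) { return h->next == h; }

/*
 * One object, three possible memberships: each embedded node is an
 * independent link, so a dquot can be on the freelist, the per-mount
 * list and a hash chain all at once.
 */
struct dquot {
        unsigned int     dq_flags;
        struct list_head q_freelist;
        struct list_head q_mplist;
        struct list_head q_hashlist;
};

int main(void)
{
        struct list_head mplist, hashchain;
        struct dquot dq = { 0 };

        INIT_LIST_HEAD(&mplist);
        INIT_LIST_HEAD(&hashchain);
        INIT_LIST_HEAD(&dq.q_freelist);
        INIT_LIST_HEAD(&dq.q_mplist);
        INIT_LIST_HEAD(&dq.q_hashlist);

        list_add(&dq.q_mplist, &mplist);
        list_add(&dq.q_hashlist, &hashchain);

        /* list_empty() replaces the old prev-pointer NULL checks */
        printf("on mplist: %d, on hash: %d, on freelist: %d\n",
               !list_empty(&dq.q_mplist), !list_empty(&dq.q_hashlist),
               !list_empty(&dq.q_freelist));
        return 0;
}
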
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 4e4ee9a57194..8d89a24ae324 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -107,8 +107,7 @@ xfs_qm_dquot_logitem_pin(
107/* ARGSUSED */ 107/* ARGSUSED */
108STATIC void 108STATIC void
109xfs_qm_dquot_logitem_unpin( 109xfs_qm_dquot_logitem_unpin(
110 xfs_dq_logitem_t *logitem, 110 xfs_dq_logitem_t *logitem)
111 int stale)
112{ 111{
113 xfs_dquot_t *dqp = logitem->qli_dquot; 112 xfs_dquot_t *dqp = logitem->qli_dquot;
114 113
@@ -123,7 +122,7 @@ xfs_qm_dquot_logitem_unpin_remove(
123 xfs_dq_logitem_t *logitem, 122 xfs_dq_logitem_t *logitem,
124 xfs_trans_t *tp) 123 xfs_trans_t *tp)
125{ 124{
126 xfs_qm_dquot_logitem_unpin(logitem, 0); 125 xfs_qm_dquot_logitem_unpin(logitem);
127} 126}
128 127
129/* 128/*
@@ -228,7 +227,7 @@ xfs_qm_dquot_logitem_pushbuf(
228 } 227 }
229 mp = dqp->q_mount; 228 mp = dqp->q_mount;
230 bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno, 229 bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno,
231 XFS_QI_DQCHUNKLEN(mp), XBF_TRYLOCK); 230 mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
232 xfs_dqunlock(dqp); 231 xfs_dqunlock(dqp);
233 if (!bp) 232 if (!bp)
234 return; 233 return;
@@ -329,8 +328,7 @@ static struct xfs_item_ops xfs_dquot_item_ops = {
329 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 328 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
330 xfs_qm_dquot_logitem_format, 329 xfs_qm_dquot_logitem_format,
331 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin, 330 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin,
332 .iop_unpin = (void(*)(xfs_log_item_t*, int)) 331 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unpin,
333 xfs_qm_dquot_logitem_unpin,
334 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) 332 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
335 xfs_qm_dquot_logitem_unpin_remove, 333 xfs_qm_dquot_logitem_unpin_remove,
336 .iop_trylock = (uint(*)(xfs_log_item_t*)) 334 .iop_trylock = (uint(*)(xfs_log_item_t*))
@@ -357,9 +355,8 @@ xfs_qm_dquot_logitem_init(
357 xfs_dq_logitem_t *lp; 355 xfs_dq_logitem_t *lp;
358 lp = &dqp->q_logitem; 356 lp = &dqp->q_logitem;
359 357
360 lp->qli_item.li_type = XFS_LI_DQUOT; 358 xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT,
361 lp->qli_item.li_ops = &xfs_dquot_item_ops; 359 &xfs_dquot_item_ops);
362 lp->qli_item.li_mountp = dqp->q_mount;
363 lp->qli_dquot = dqp; 360 lp->qli_dquot = dqp;
364 lp->qli_format.qlf_type = XFS_LI_DQUOT; 361 lp->qli_format.qlf_type = XFS_LI_DQUOT;
365 lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id); 362 lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id);
@@ -426,7 +423,7 @@ xfs_qm_qoff_logitem_pin(xfs_qoff_logitem_t *qf)
426 */ 423 */
427/*ARGSUSED*/ 424/*ARGSUSED*/
428STATIC void 425STATIC void
429xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf, int stale) 426xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf)
430{ 427{
431 return; 428 return;
432} 429}
@@ -537,8 +534,7 @@ static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
537 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 534 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
538 xfs_qm_qoff_logitem_format, 535 xfs_qm_qoff_logitem_format,
539 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, 536 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
540 .iop_unpin = (void(*)(xfs_log_item_t* ,int)) 537 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin,
541 xfs_qm_qoff_logitem_unpin,
542 .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) 538 .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
543 xfs_qm_qoff_logitem_unpin_remove, 539 xfs_qm_qoff_logitem_unpin_remove,
544 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, 540 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
@@ -559,8 +555,7 @@ static struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
559 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 555 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
560 xfs_qm_qoff_logitem_format, 556 xfs_qm_qoff_logitem_format,
561 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, 557 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
562 .iop_unpin = (void(*)(xfs_log_item_t*, int)) 558 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin,
563 xfs_qm_qoff_logitem_unpin,
564 .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) 559 .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
565 xfs_qm_qoff_logitem_unpin_remove, 560 xfs_qm_qoff_logitem_unpin_remove,
566 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, 561 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
@@ -586,11 +581,8 @@ xfs_qm_qoff_logitem_init(
586 581
587 qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP); 582 qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP);
588 583
589 qf->qql_item.li_type = XFS_LI_QUOTAOFF; 584 xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
590 if (start) 585 &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
591 qf->qql_item.li_ops = &xfs_qm_qoffend_logitem_ops;
592 else
593 qf->qql_item.li_ops = &xfs_qm_qoff_logitem_ops;
594 qf->qql_item.li_mountp = mp; 586 qf->qql_item.li_mountp = mp;
595 qf->qql_format.qf_type = XFS_LI_QUOTAOFF; 587 qf->qql_format.qf_type = XFS_LI_QUOTAOFF;
596 qf->qql_format.qf_flags = flags; 588 qf->qql_format.qf_flags = flags;
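
Both log-item diffs above collapse three open-coded assignments (li_type, li_ops, li_mountp) into one xfs_log_item_init() call, so a future field only needs initializing in one place. A sketch of that consolidation (the struct and function names below are simplified stand-ins, not the XFS API):

#include <stdio.h>

struct ops { const char *name; };
struct log_item {
        int              li_type;
        const struct ops *li_ops;
        void             *li_mountp;
};

/* one constructor instead of three assignments at every call site */
static void log_item_init(void *mp, struct log_item *lip, int type,
                          const struct ops *ops)
{
        lip->li_type = type;
        lip->li_ops = ops;
        lip->li_mountp = mp;
}

static const struct ops dquot_ops = { "dquot" };
static const struct ops qoff_ops  = { "quotaoff" };

int main(void)
{
        struct log_item item;
        int start = 1;

        /* callers can still pick ops conditionally, as the qoff code does */
        log_item_init(NULL, &item, 42, start ? &qoff_ops : &dquot_ops);
        printf("item type %d uses %s ops\n", item.li_type, item.li_ops->name);
        return 0;
}
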
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 417e61e3d9dd..38e764146644 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -67,9 +67,6 @@ static cred_t xfs_zerocr;
67STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); 67STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int);
68STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); 68STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
69 69
70STATIC void xfs_qm_freelist_init(xfs_frlist_t *);
71STATIC void xfs_qm_freelist_destroy(xfs_frlist_t *);
72
73STATIC int xfs_qm_init_quotainos(xfs_mount_t *); 70STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
74STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); 71STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
75STATIC int xfs_qm_shake(int, gfp_t); 72STATIC int xfs_qm_shake(int, gfp_t);
@@ -84,21 +81,25 @@ extern struct mutex qcheck_lock;
84#endif 81#endif
85 82
86#ifdef QUOTADEBUG 83#ifdef QUOTADEBUG
87#define XQM_LIST_PRINT(l, NXT, title) \ 84static void
88{ \ 85xfs_qm_dquot_list_print(
89 xfs_dquot_t *dqp; int i = 0; \ 86 struct xfs_mount *mp)
90 cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \ 87{
91 for (dqp = (l)->qh_next; dqp != NULL; dqp = dqp->NXT) { \ 88 xfs_dquot_t *dqp;
92 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" " \ 89 int i = 0;
93 "bcnt = %d, icnt = %d, refs = %d", \ 90
 94 ++i, (int) be32_to_cpu(dqp->q_core.d_id), \ 91 list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist, q_mplist) {
95 DQFLAGTO_TYPESTR(dqp), \ 92 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" "
96 (int) be64_to_cpu(dqp->q_core.d_bcount), \ 93 "bcnt = %lld, icnt = %lld, refs = %d",
97 (int) be64_to_cpu(dqp->q_core.d_icount), \ 94 i++, be32_to_cpu(dqp->q_core.d_id),
98 (int) dqp->q_nrefs); } \ 95 DQFLAGTO_TYPESTR(dqp),
96 (long long)be64_to_cpu(dqp->q_core.d_bcount),
97 (long long)be64_to_cpu(dqp->q_core.d_icount),
98 dqp->q_nrefs);
99 }
99} 100}
100#else 101#else
101#define XQM_LIST_PRINT(l, NXT, title) do { } while (0) 102static void xfs_qm_dquot_list_print(struct xfs_mount *mp) { }
102#endif 103#endif
103 104
104/* 105/*
@@ -144,7 +145,9 @@ xfs_Gqm_init(void)
144 /* 145 /*
145 * Freelist of all dquots of all file systems 146 * Freelist of all dquots of all file systems
146 */ 147 */
147 xfs_qm_freelist_init(&(xqm->qm_dqfreelist)); 148 INIT_LIST_HEAD(&xqm->qm_dqfrlist);
149 xqm->qm_dqfrlist_cnt = 0;
150 mutex_init(&xqm->qm_dqfrlist_lock);
148 151
149 /* 152 /*
150 * dquot zone. we register our own low-memory callback. 153 * dquot zone. we register our own low-memory callback.
@@ -189,6 +192,7 @@ STATIC void
189xfs_qm_destroy( 192xfs_qm_destroy(
190 struct xfs_qm *xqm) 193 struct xfs_qm *xqm)
191{ 194{
195 struct xfs_dquot *dqp, *n;
192 int hsize, i; 196 int hsize, i;
193 197
194 ASSERT(xqm != NULL); 198 ASSERT(xqm != NULL);
@@ -204,7 +208,21 @@ xfs_qm_destroy(
204 xqm->qm_usr_dqhtable = NULL; 208 xqm->qm_usr_dqhtable = NULL;
205 xqm->qm_grp_dqhtable = NULL; 209 xqm->qm_grp_dqhtable = NULL;
206 xqm->qm_dqhashmask = 0; 210 xqm->qm_dqhashmask = 0;
207 xfs_qm_freelist_destroy(&(xqm->qm_dqfreelist)); 211
212 /* frlist cleanup */
213 mutex_lock(&xqm->qm_dqfrlist_lock);
214 list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) {
215 xfs_dqlock(dqp);
216#ifdef QUOTADEBUG
217 cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp);
218#endif
219 list_del_init(&dqp->q_freelist);
220 xfs_Gqm->qm_dqfrlist_cnt--;
221 xfs_dqunlock(dqp);
222 xfs_qm_dqdestroy(dqp);
223 }
224 mutex_unlock(&xqm->qm_dqfrlist_lock);
225 mutex_destroy(&xqm->qm_dqfrlist_lock);
208#ifdef DEBUG 226#ifdef DEBUG
209 mutex_destroy(&qcheck_lock); 227 mutex_destroy(&qcheck_lock);
210#endif 228#endif
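
The destroy path now walks the freelist with list_for_each_entry_safe(), which caches the next node before the loop body runs so the current dquot can be unlinked and freed mid-walk. A self-contained sketch of the safe-iteration macro (re-implemented here with an explicit type argument, since plain userspace C lacks the typeof trick the kernel macro uses):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct list_head { struct list_head *next, *prev; };
static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }
static void list_add(struct list_head *n, struct list_head *h)
{
        n->next = h->next; n->prev = h;
        h->next->prev = n; h->next = n;
}
static void list_del_init(struct list_head *e)
{
        e->prev->next = e->next; e->next->prev = e->prev;
        INIT_LIST_HEAD(e);
}

#define container_of(p, T, m) ((T *)((char *)(p) - offsetof(T, m)))

/* the _safe variant caches the next node so the current one may be freed */
#define list_for_each_entry_safe(pos, n, head, T, member)                 \
        for (pos = container_of((head)->next, T, member),                 \
             n = container_of(pos->member.next, T, member);               \
             &pos->member != (head);                                      \
             pos = n, n = container_of(n->member.next, T, member))

struct dquot { int id; struct list_head q_freelist; };

int main(void)
{
        struct list_head frlist;
        struct dquot *dqp, *n;
        int i;

        INIT_LIST_HEAD(&frlist);
        for (i = 0; i < 3; i++) {
                dqp = malloc(sizeof(*dqp));
                dqp->id = i;
                list_add(&dqp->q_freelist, &frlist);
        }
        /* destroy loop: unlink, then free, without touching freed memory */
        list_for_each_entry_safe(dqp, n, &frlist, struct dquot, q_freelist) {
                list_del_init(&dqp->q_freelist);
                printf("destroying dquot %d\n", dqp->id);
                free(dqp);
        }
        return 0;
}
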
@@ -256,7 +274,7 @@ STATIC void
256xfs_qm_rele_quotafs_ref( 274xfs_qm_rele_quotafs_ref(
257 struct xfs_mount *mp) 275 struct xfs_mount *mp)
258{ 276{
259 xfs_dquot_t *dqp, *nextdqp; 277 xfs_dquot_t *dqp, *n;
260 278
261 ASSERT(xfs_Gqm); 279 ASSERT(xfs_Gqm);
262 ASSERT(xfs_Gqm->qm_nrefs > 0); 280 ASSERT(xfs_Gqm->qm_nrefs > 0);
@@ -264,26 +282,24 @@ xfs_qm_rele_quotafs_ref(
264 /* 282 /*
265 * Go thru the freelist and destroy all inactive dquots. 283 * Go thru the freelist and destroy all inactive dquots.
266 */ 284 */
267 xfs_qm_freelist_lock(xfs_Gqm); 285 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
268 286
269 for (dqp = xfs_Gqm->qm_dqfreelist.qh_next; 287 list_for_each_entry_safe(dqp, n, &xfs_Gqm->qm_dqfrlist, q_freelist) {
270 dqp != (xfs_dquot_t *)&(xfs_Gqm->qm_dqfreelist); ) {
271 xfs_dqlock(dqp); 288 xfs_dqlock(dqp);
272 nextdqp = dqp->dq_flnext;
273 if (dqp->dq_flags & XFS_DQ_INACTIVE) { 289 if (dqp->dq_flags & XFS_DQ_INACTIVE) {
274 ASSERT(dqp->q_mount == NULL); 290 ASSERT(dqp->q_mount == NULL);
275 ASSERT(! XFS_DQ_IS_DIRTY(dqp)); 291 ASSERT(! XFS_DQ_IS_DIRTY(dqp));
276 ASSERT(dqp->HL_PREVP == NULL); 292 ASSERT(list_empty(&dqp->q_hashlist));
277 ASSERT(dqp->MPL_PREVP == NULL); 293 ASSERT(list_empty(&dqp->q_mplist));
278 XQM_FREELIST_REMOVE(dqp); 294 list_del_init(&dqp->q_freelist);
295 xfs_Gqm->qm_dqfrlist_cnt--;
279 xfs_dqunlock(dqp); 296 xfs_dqunlock(dqp);
280 xfs_qm_dqdestroy(dqp); 297 xfs_qm_dqdestroy(dqp);
281 } else { 298 } else {
282 xfs_dqunlock(dqp); 299 xfs_dqunlock(dqp);
283 } 300 }
284 dqp = nextdqp;
285 } 301 }
286 xfs_qm_freelist_unlock(xfs_Gqm); 302 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
287 303
288 /* 304 /*
289 * Destroy the entire XQM. If somebody mounts with quotaon, this'll 305 * Destroy the entire XQM. If somebody mounts with quotaon, this'll
@@ -305,7 +321,7 @@ xfs_qm_unmount(
305 struct xfs_mount *mp) 321 struct xfs_mount *mp)
306{ 322{
307 if (mp->m_quotainfo) { 323 if (mp->m_quotainfo) {
308 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING); 324 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
309 xfs_qm_destroy_quotainfo(mp); 325 xfs_qm_destroy_quotainfo(mp);
310 } 326 }
311} 327}
@@ -449,20 +465,21 @@ xfs_qm_unmount_quotas(
449 */ 465 */
450STATIC int 466STATIC int
451xfs_qm_dqflush_all( 467xfs_qm_dqflush_all(
452 xfs_mount_t *mp, 468 struct xfs_mount *mp,
453 int sync_mode) 469 int sync_mode)
454{ 470{
455 int recl; 471 struct xfs_quotainfo *q = mp->m_quotainfo;
456 xfs_dquot_t *dqp; 472 int recl;
457 int niters; 473 struct xfs_dquot *dqp;
458 int error; 474 int niters;
475 int error;
459 476
460 if (mp->m_quotainfo == NULL) 477 if (!q)
461 return 0; 478 return 0;
462 niters = 0; 479 niters = 0;
463again: 480again:
464 xfs_qm_mplist_lock(mp); 481 mutex_lock(&q->qi_dqlist_lock);
465 FOREACH_DQUOT_IN_MP(dqp, mp) { 482 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
466 xfs_dqlock(dqp); 483 xfs_dqlock(dqp);
467 if (! XFS_DQ_IS_DIRTY(dqp)) { 484 if (! XFS_DQ_IS_DIRTY(dqp)) {
468 xfs_dqunlock(dqp); 485 xfs_dqunlock(dqp);
@@ -470,7 +487,7 @@ again:
470 } 487 }
471 488
472 /* XXX a sentinel would be better */ 489 /* XXX a sentinel would be better */
473 recl = XFS_QI_MPLRECLAIMS(mp); 490 recl = q->qi_dqreclaims;
474 if (!xfs_dqflock_nowait(dqp)) { 491 if (!xfs_dqflock_nowait(dqp)) {
475 /* 492 /*
476 * If we can't grab the flush lock then check 493 * If we can't grab the flush lock then check
@@ -485,21 +502,21 @@ again:
485 * Let go of the mplist lock. We don't want to hold it 502 * Let go of the mplist lock. We don't want to hold it
486 * across a disk write. 503 * across a disk write.
487 */ 504 */
488 xfs_qm_mplist_unlock(mp); 505 mutex_unlock(&q->qi_dqlist_lock);
489 error = xfs_qm_dqflush(dqp, sync_mode); 506 error = xfs_qm_dqflush(dqp, sync_mode);
490 xfs_dqunlock(dqp); 507 xfs_dqunlock(dqp);
491 if (error) 508 if (error)
492 return error; 509 return error;
493 510
494 xfs_qm_mplist_lock(mp); 511 mutex_lock(&q->qi_dqlist_lock);
495 if (recl != XFS_QI_MPLRECLAIMS(mp)) { 512 if (recl != q->qi_dqreclaims) {
496 xfs_qm_mplist_unlock(mp); 513 mutex_unlock(&q->qi_dqlist_lock);
497 /* XXX restart limit */ 514 /* XXX restart limit */
498 goto again; 515 goto again;
499 } 516 }
500 } 517 }
501 518
502 xfs_qm_mplist_unlock(mp); 519 mutex_unlock(&q->qi_dqlist_lock);
503 /* return ! busy */ 520 /* return ! busy */
504 return 0; 521 return 0;
505} 522}
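
xfs_qm_dqflush_all() cannot hold qi_dqlist_lock across a disk write, so it snapshots qi_dqreclaims under the lock, drops the lock to flush, and restarts the whole walk if the counter moved while the lock was released. A minimal sketch of that generation-counter restart pattern (single-threaded, with the racing reclaim simulated by a direct call; the restart cap of 10 is illustrative):

#include <stdio.h>

static unsigned int dqreclaims;         /* bumped whenever the list shrinks */

static void concurrent_reclaim(void) { dqreclaims++; }

int main(void)
{
        int restarts = 0;
        unsigned int recl;

again:
        /* ... lock the list, find a dirty dquot ... */
        recl = dqreclaims;              /* snapshot under the lock */

        /* ... unlock the list and do the slow flush ... */
        if (restarts == 0)
                concurrent_reclaim();   /* simulate a racing reclaim */

        /* ... relock the list ... */
        if (recl != dqreclaims && ++restarts < 10)
                goto again;             /* list changed under us: start over */

        printf("walk finished after %d restart(s)\n", restarts);
        return 0;
}
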
@@ -509,15 +526,15 @@ again:
509 */ 526 */
510STATIC void 527STATIC void
511xfs_qm_detach_gdquots( 528xfs_qm_detach_gdquots(
512 xfs_mount_t *mp) 529 struct xfs_mount *mp)
513{ 530{
514 xfs_dquot_t *dqp, *gdqp; 531 struct xfs_quotainfo *q = mp->m_quotainfo;
515 int nrecl; 532 struct xfs_dquot *dqp, *gdqp;
533 int nrecl;
516 534
517 again: 535 again:
518 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); 536 ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
519 dqp = XFS_QI_MPLNEXT(mp); 537 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
520 while (dqp) {
521 xfs_dqlock(dqp); 538 xfs_dqlock(dqp);
522 if ((gdqp = dqp->q_gdquot)) { 539 if ((gdqp = dqp->q_gdquot)) {
523 xfs_dqlock(gdqp); 540 xfs_dqlock(gdqp);
@@ -530,15 +547,14 @@ xfs_qm_detach_gdquots(
530 * Can't hold the mplist lock across a dqput. 547 * Can't hold the mplist lock across a dqput.
531 * XXXmust convert to marker based iterations here. 548 * XXXmust convert to marker based iterations here.
532 */ 549 */
533 nrecl = XFS_QI_MPLRECLAIMS(mp); 550 nrecl = q->qi_dqreclaims;
534 xfs_qm_mplist_unlock(mp); 551 mutex_unlock(&q->qi_dqlist_lock);
535 xfs_qm_dqput(gdqp); 552 xfs_qm_dqput(gdqp);
536 553
537 xfs_qm_mplist_lock(mp); 554 mutex_lock(&q->qi_dqlist_lock);
538 if (nrecl != XFS_QI_MPLRECLAIMS(mp)) 555 if (nrecl != q->qi_dqreclaims)
539 goto again; 556 goto again;
540 } 557 }
541 dqp = dqp->MPL_NEXT;
542 } 558 }
543} 559}
544 560
@@ -550,23 +566,23 @@ xfs_qm_detach_gdquots(
550 */ 566 */
551STATIC int 567STATIC int
552xfs_qm_dqpurge_int( 568xfs_qm_dqpurge_int(
553 xfs_mount_t *mp, 569 struct xfs_mount *mp,
554 uint flags) /* QUOTAOFF/UMOUNTING/UQUOTA/PQUOTA/GQUOTA */ 570 uint flags)
555{ 571{
556 xfs_dquot_t *dqp; 572 struct xfs_quotainfo *q = mp->m_quotainfo;
557 uint dqtype; 573 struct xfs_dquot *dqp, *n;
558 int nrecl; 574 uint dqtype;
559 xfs_dquot_t *nextdqp; 575 int nrecl;
560 int nmisses; 576 int nmisses;
561 577
562 if (mp->m_quotainfo == NULL) 578 if (!q)
563 return 0; 579 return 0;
564 580
565 dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0; 581 dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0;
566 dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0; 582 dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0;
567 dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0; 583 dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0;
568 584
569 xfs_qm_mplist_lock(mp); 585 mutex_lock(&q->qi_dqlist_lock);
570 586
571 /* 587 /*
572 * In the first pass through all incore dquots of this filesystem, 588 * In the first pass through all incore dquots of this filesystem,
@@ -578,28 +594,25 @@ xfs_qm_dqpurge_int(
578 594
579 again: 595 again:
580 nmisses = 0; 596 nmisses = 0;
581 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); 597 ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
582 /* 598 /*
583 * Try to get rid of all of the unwanted dquots. The idea is to 599 * Try to get rid of all of the unwanted dquots. The idea is to
584 * get them off mplist and hashlist, but leave them on freelist. 600 * get them off mplist and hashlist, but leave them on freelist.
585 */ 601 */
586 dqp = XFS_QI_MPLNEXT(mp); 602 list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) {
587 while (dqp) {
588 /* 603 /*
589 * It's OK to look at the type without taking dqlock here. 604 * It's OK to look at the type without taking dqlock here.
590 * We're holding the mplist lock here, and that's needed for 605 * We're holding the mplist lock here, and that's needed for
591 * a dqreclaim. 606 * a dqreclaim.
592 */ 607 */
593 if ((dqp->dq_flags & dqtype) == 0) { 608 if ((dqp->dq_flags & dqtype) == 0)
594 dqp = dqp->MPL_NEXT;
595 continue; 609 continue;
596 }
597 610
598 if (!mutex_trylock(&dqp->q_hash->qh_lock)) { 611 if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
599 nrecl = XFS_QI_MPLRECLAIMS(mp); 612 nrecl = q->qi_dqreclaims;
600 xfs_qm_mplist_unlock(mp); 613 mutex_unlock(&q->qi_dqlist_lock);
601 mutex_lock(&dqp->q_hash->qh_lock); 614 mutex_lock(&dqp->q_hash->qh_lock);
602 xfs_qm_mplist_lock(mp); 615 mutex_lock(&q->qi_dqlist_lock);
603 616
604 /* 617 /*
605 * XXXTheoretically, we can get into a very long 618 * XXXTheoretically, we can get into a very long
@@ -607,7 +620,7 @@ xfs_qm_dqpurge_int(
607 * No one can be adding dquots to the mplist at 620 * No one can be adding dquots to the mplist at
608 * this point, but somebody might be taking things off. 621 * this point, but somebody might be taking things off.
609 */ 622 */
610 if (nrecl != XFS_QI_MPLRECLAIMS(mp)) { 623 if (nrecl != q->qi_dqreclaims) {
611 mutex_unlock(&dqp->q_hash->qh_lock); 624 mutex_unlock(&dqp->q_hash->qh_lock);
612 goto again; 625 goto again;
613 } 626 }
@@ -617,11 +630,9 @@ xfs_qm_dqpurge_int(
617 * Take the dquot off the mplist and hashlist. It may remain on 630 * Take the dquot off the mplist and hashlist. It may remain on
618 * freelist in INACTIVE state. 631 * freelist in INACTIVE state.
619 */ 632 */
620 nextdqp = dqp->MPL_NEXT;
621 nmisses += xfs_qm_dqpurge(dqp); 633 nmisses += xfs_qm_dqpurge(dqp);
622 dqp = nextdqp;
623 } 634 }
624 xfs_qm_mplist_unlock(mp); 635 mutex_unlock(&q->qi_dqlist_lock);
625 return nmisses; 636 return nmisses;
626} 637}
627 638
@@ -921,12 +932,13 @@ xfs_qm_dqdetach(
921 932
922int 933int
923xfs_qm_sync( 934xfs_qm_sync(
924 xfs_mount_t *mp, 935 struct xfs_mount *mp,
925 int flags) 936 int flags)
926{ 937{
927 int recl, restarts; 938 struct xfs_quotainfo *q = mp->m_quotainfo;
928 xfs_dquot_t *dqp; 939 int recl, restarts;
929 int error; 940 struct xfs_dquot *dqp;
941 int error;
930 942
931 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) 943 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
932 return 0; 944 return 0;
@@ -934,18 +946,19 @@ xfs_qm_sync(
934 restarts = 0; 946 restarts = 0;
935 947
936 again: 948 again:
937 xfs_qm_mplist_lock(mp); 949 mutex_lock(&q->qi_dqlist_lock);
938 /* 950 /*
 939 * dqpurge_all() also takes the mplist lock and iterates through all dquots 951 * dqpurge_all() also takes the mplist lock and iterates through all dquots
940 * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared 952 * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared
941 * when we have the mplist lock, we know that dquots will be consistent 953 * when we have the mplist lock, we know that dquots will be consistent
942 * as long as we have it locked. 954 * as long as we have it locked.
943 */ 955 */
944 if (! XFS_IS_QUOTA_ON(mp)) { 956 if (!XFS_IS_QUOTA_ON(mp)) {
945 xfs_qm_mplist_unlock(mp); 957 mutex_unlock(&q->qi_dqlist_lock);
946 return 0; 958 return 0;
947 } 959 }
948 FOREACH_DQUOT_IN_MP(dqp, mp) { 960 ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
961 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
949 /* 962 /*
950 * If this is vfs_sync calling, then skip the dquots that 963 * If this is vfs_sync calling, then skip the dquots that
951 * don't 'seem' to be dirty. ie. don't acquire dqlock. 964 * don't 'seem' to be dirty. ie. don't acquire dqlock.
@@ -969,7 +982,7 @@ xfs_qm_sync(
969 } 982 }
970 983
971 /* XXX a sentinel would be better */ 984 /* XXX a sentinel would be better */
972 recl = XFS_QI_MPLRECLAIMS(mp); 985 recl = q->qi_dqreclaims;
973 if (!xfs_dqflock_nowait(dqp)) { 986 if (!xfs_dqflock_nowait(dqp)) {
974 if (flags & SYNC_TRYLOCK) { 987 if (flags & SYNC_TRYLOCK) {
975 xfs_dqunlock(dqp); 988 xfs_dqunlock(dqp);
@@ -989,7 +1002,7 @@ xfs_qm_sync(
989 * Let go of the mplist lock. We don't want to hold it 1002 * Let go of the mplist lock. We don't want to hold it
990 * across a disk write 1003 * across a disk write
991 */ 1004 */
992 xfs_qm_mplist_unlock(mp); 1005 mutex_unlock(&q->qi_dqlist_lock);
993 error = xfs_qm_dqflush(dqp, flags); 1006 error = xfs_qm_dqflush(dqp, flags);
994 xfs_dqunlock(dqp); 1007 xfs_dqunlock(dqp);
995 if (error && XFS_FORCED_SHUTDOWN(mp)) 1008 if (error && XFS_FORCED_SHUTDOWN(mp))
@@ -997,17 +1010,17 @@ xfs_qm_sync(
997 else if (error) 1010 else if (error)
998 return error; 1011 return error;
999 1012
1000 xfs_qm_mplist_lock(mp); 1013 mutex_lock(&q->qi_dqlist_lock);
1001 if (recl != XFS_QI_MPLRECLAIMS(mp)) { 1014 if (recl != q->qi_dqreclaims) {
1002 if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS) 1015 if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS)
1003 break; 1016 break;
1004 1017
1005 xfs_qm_mplist_unlock(mp); 1018 mutex_unlock(&q->qi_dqlist_lock);
1006 goto again; 1019 goto again;
1007 } 1020 }
1008 } 1021 }
1009 1022
1010 xfs_qm_mplist_unlock(mp); 1023 mutex_unlock(&q->qi_dqlist_lock);
1011 return 0; 1024 return 0;
1012} 1025}
1013 1026
@@ -1052,8 +1065,9 @@ xfs_qm_init_quotainfo(
1052 return error; 1065 return error;
1053 } 1066 }
1054 1067
1055 xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0); 1068 INIT_LIST_HEAD(&qinf->qi_dqlist);
1056 lockdep_set_class(&qinf->qi_dqlist.qh_lock, &xfs_quota_mplist_class); 1069 mutex_init(&qinf->qi_dqlist_lock);
1070 lockdep_set_class(&qinf->qi_dqlist_lock, &xfs_quota_mplist_class);
1057 1071
1058 qinf->qi_dqreclaims = 0; 1072 qinf->qi_dqreclaims = 0;
1059 1073
@@ -1150,7 +1164,8 @@ xfs_qm_destroy_quotainfo(
1150 */ 1164 */
1151 xfs_qm_rele_quotafs_ref(mp); 1165 xfs_qm_rele_quotafs_ref(mp);
1152 1166
1153 xfs_qm_list_destroy(&qi->qi_dqlist); 1167 ASSERT(list_empty(&qi->qi_dqlist));
1168 mutex_destroy(&qi->qi_dqlist_lock);
1154 1169
1155 if (qi->qi_uquotaip) { 1170 if (qi->qi_uquotaip) {
1156 IRELE(qi->qi_uquotaip); 1171 IRELE(qi->qi_uquotaip);
@@ -1177,7 +1192,7 @@ xfs_qm_list_init(
1177 int n) 1192 int n)
1178{ 1193{
1179 mutex_init(&list->qh_lock); 1194 mutex_init(&list->qh_lock);
1180 list->qh_next = NULL; 1195 INIT_LIST_HEAD(&list->qh_list);
1181 list->qh_version = 0; 1196 list->qh_version = 0;
1182 list->qh_nelems = 0; 1197 list->qh_nelems = 0;
1183} 1198}
@@ -1316,9 +1331,6 @@ xfs_qm_qino_alloc(
1316 */ 1331 */
1317 spin_lock(&mp->m_sb_lock); 1332 spin_lock(&mp->m_sb_lock);
1318 if (flags & XFS_QMOPT_SBVERSION) { 1333 if (flags & XFS_QMOPT_SBVERSION) {
1319#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
1320 unsigned oldv = mp->m_sb.sb_versionnum;
1321#endif
1322 ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); 1334 ASSERT(!xfs_sb_version_hasquota(&mp->m_sb));
1323 ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | 1335 ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
1324 XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) == 1336 XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) ==
@@ -1331,11 +1343,6 @@ xfs_qm_qino_alloc(
1331 1343
1332 /* qflags will get updated _after_ quotacheck */ 1344 /* qflags will get updated _after_ quotacheck */
1333 mp->m_sb.sb_qflags = 0; 1345 mp->m_sb.sb_qflags = 0;
1334#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
1335 cmn_err(CE_NOTE,
1336 "Old superblock version %x, converting to %x.",
1337 oldv, mp->m_sb.sb_versionnum);
1338#endif
1339 } 1346 }
1340 if (flags & XFS_QMOPT_UQUOTA) 1347 if (flags & XFS_QMOPT_UQUOTA)
1341 mp->m_sb.sb_uquotino = (*ip)->i_ino; 1348 mp->m_sb.sb_uquotino = (*ip)->i_ino;
@@ -1371,10 +1378,10 @@ xfs_qm_reset_dqcounts(
1371#ifdef DEBUG 1378#ifdef DEBUG
1372 j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); 1379 j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
1373 do_div(j, sizeof(xfs_dqblk_t)); 1380 do_div(j, sizeof(xfs_dqblk_t));
1374 ASSERT(XFS_QM_DQPERBLK(mp) == j); 1381 ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
1375#endif 1382#endif
1376 ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp); 1383 ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp);
1377 for (j = 0; j < XFS_QM_DQPERBLK(mp); j++) { 1384 for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) {
1378 /* 1385 /*
1379 * Do a sanity check, and if needed, repair the dqblk. Don't 1386 * Do a sanity check, and if needed, repair the dqblk. Don't
1380 * output any warnings because it's perfectly possible to 1387 * output any warnings because it's perfectly possible to
@@ -1429,7 +1436,7 @@ xfs_qm_dqiter_bufs(
1429 while (blkcnt--) { 1436 while (blkcnt--) {
1430 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, 1437 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
1431 XFS_FSB_TO_DADDR(mp, bno), 1438 XFS_FSB_TO_DADDR(mp, bno),
1432 (int)XFS_QI_DQCHUNKLEN(mp), 0, &bp); 1439 mp->m_quotainfo->qi_dqchunklen, 0, &bp);
1433 if (error) 1440 if (error)
1434 break; 1441 break;
1435 1442
@@ -1439,7 +1446,7 @@ xfs_qm_dqiter_bufs(
1439 * goto the next block. 1446 * goto the next block.
1440 */ 1447 */
1441 bno++; 1448 bno++;
1442 firstid += XFS_QM_DQPERBLK(mp); 1449 firstid += mp->m_quotainfo->qi_dqperchunk;
1443 } 1450 }
1444 return error; 1451 return error;
1445} 1452}
@@ -1505,7 +1512,7 @@ xfs_qm_dqiterate(
1505 continue; 1512 continue;
1506 1513
1507 firstid = (xfs_dqid_t) map[i].br_startoff * 1514 firstid = (xfs_dqid_t) map[i].br_startoff *
1508 XFS_QM_DQPERBLK(mp); 1515 mp->m_quotainfo->qi_dqperchunk;
1509 /* 1516 /*
1510 * Do a read-ahead on the next extent. 1517 * Do a read-ahead on the next extent.
1511 */ 1518 */
@@ -1516,7 +1523,7 @@ xfs_qm_dqiterate(
1516 while (rablkcnt--) { 1523 while (rablkcnt--) {
1517 xfs_baread(mp->m_ddev_targp, 1524 xfs_baread(mp->m_ddev_targp,
1518 XFS_FSB_TO_DADDR(mp, rablkno), 1525 XFS_FSB_TO_DADDR(mp, rablkno),
1519 (int)XFS_QI_DQCHUNKLEN(mp)); 1526 mp->m_quotainfo->qi_dqchunklen);
1520 rablkno++; 1527 rablkno++;
1521 } 1528 }
1522 } 1529 }
@@ -1576,8 +1583,10 @@ xfs_qm_quotacheck_dqadjust(
1576 1583
1577 /* 1584 /*
1578 * Set default limits, adjust timers (since we changed usages) 1585 * Set default limits, adjust timers (since we changed usages)
1586 *
1587 * There are no timers for the default values set in the root dquot.
1579 */ 1588 */
1580 if (! XFS_IS_SUSER_DQUOT(dqp)) { 1589 if (dqp->q_core.d_id) {
1581 xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core); 1590 xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core);
1582 xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core); 1591 xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core);
1583 } 1592 }
@@ -1747,14 +1756,14 @@ xfs_qm_quotacheck(
1747 lastino = 0; 1756 lastino = 0;
1748 flags = 0; 1757 flags = 0;
1749 1758
1750 ASSERT(XFS_QI_UQIP(mp) || XFS_QI_GQIP(mp)); 1759 ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip);
1751 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 1760 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
1752 1761
1753 /* 1762 /*
1754 * There should be no cached dquots. The (simplistic) quotacheck 1763 * There should be no cached dquots. The (simplistic) quotacheck
1755 * algorithm doesn't like that. 1764 * algorithm doesn't like that.
1756 */ 1765 */
1757 ASSERT(XFS_QI_MPLNDQUOTS(mp) == 0); 1766 ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist));
1758 1767
1759 cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname); 1768 cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname);
1760 1769
@@ -1763,15 +1772,19 @@ xfs_qm_quotacheck(
1763 * their counters to zero. We need a clean slate. 1772 * their counters to zero. We need a clean slate.
1764 * We don't log our changes till later. 1773 * We don't log our changes till later.
1765 */ 1774 */
1766 if ((uip = XFS_QI_UQIP(mp))) { 1775 uip = mp->m_quotainfo->qi_uquotaip;
1767 if ((error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA))) 1776 if (uip) {
1777 error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA);
1778 if (error)
1768 goto error_return; 1779 goto error_return;
1769 flags |= XFS_UQUOTA_CHKD; 1780 flags |= XFS_UQUOTA_CHKD;
1770 } 1781 }
1771 1782
1772 if ((gip = XFS_QI_GQIP(mp))) { 1783 gip = mp->m_quotainfo->qi_gquotaip;
1773 if ((error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? 1784 if (gip) {
1774 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA))) 1785 error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
1786 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
1787 if (error)
1775 goto error_return; 1788 goto error_return;
1776 flags |= XFS_OQUOTA_CHKD; 1789 flags |= XFS_OQUOTA_CHKD;
1777 } 1790 }
@@ -1804,7 +1817,7 @@ xfs_qm_quotacheck(
1804 * at this point (because we intentionally didn't in dqget_noattach). 1817 * at this point (because we intentionally didn't in dqget_noattach).
1805 */ 1818 */
1806 if (error) { 1819 if (error) {
1807 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF); 1820 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
1808 goto error_return; 1821 goto error_return;
1809 } 1822 }
1810 1823
@@ -1825,7 +1838,7 @@ xfs_qm_quotacheck(
1825 mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD); 1838 mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD);
1826 mp->m_qflags |= flags; 1839 mp->m_qflags |= flags;
1827 1840
1828 XQM_LIST_PRINT(&(XFS_QI_MPL_LIST(mp)), MPL_NEXT, "++++ Mp list +++"); 1841 xfs_qm_dquot_list_print(mp);
1829 1842
1830 error_return: 1843 error_return:
1831 if (error) { 1844 if (error) {
@@ -1920,59 +1933,53 @@ xfs_qm_init_quotainos(
1920 } 1933 }
1921 } 1934 }
1922 1935
1923 XFS_QI_UQIP(mp) = uip; 1936 mp->m_quotainfo->qi_uquotaip = uip;
1924 XFS_QI_GQIP(mp) = gip; 1937 mp->m_quotainfo->qi_gquotaip = gip;
1925 1938
1926 return 0; 1939 return 0;
1927} 1940}
1928 1941
1929 1942
1943
1930/* 1944/*
1931 * Traverse the freelist of dquots and attempt to reclaim a maximum of 1945 * Just pop the least recently used dquot off the freelist and
1932 * 'howmany' dquots. This operation races with dqlookup(), and attempts to 1946 * recycle it. The returned dquot is locked.
1933 * favor the lookup function ...
1934 * XXXsup merge this with qm_reclaim_one().
1935 */ 1947 */
1936STATIC int 1948STATIC xfs_dquot_t *
1937xfs_qm_shake_freelist( 1949xfs_qm_dqreclaim_one(void)
1938 int howmany)
1939{ 1950{
1940 int nreclaimed; 1951 xfs_dquot_t *dqpout;
1941 xfs_dqhash_t *hash; 1952 xfs_dquot_t *dqp;
1942 xfs_dquot_t *dqp, *nextdqp;
1943 int restarts; 1953 int restarts;
1944 int nflushes;
1945
1946 if (howmany <= 0)
1947 return 0;
1948 1954
1949 nreclaimed = 0;
1950 restarts = 0; 1955 restarts = 0;
1951 nflushes = 0; 1956 dqpout = NULL;
1952 1957
1953#ifdef QUOTADEBUG 1958 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
1954 cmn_err(CE_DEBUG, "Shake free 0x%x", howmany); 1959startagain:
1955#endif 1960 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
1956 /* lock order is : hashchainlock, freelistlock, mplistlock */
1957 tryagain:
1958 xfs_qm_freelist_lock(xfs_Gqm);
1959 1961
1960 for (dqp = xfs_Gqm->qm_dqfreelist.qh_next; 1962 list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
1961 ((dqp != (xfs_dquot_t *) &xfs_Gqm->qm_dqfreelist) && 1963 struct xfs_mount *mp = dqp->q_mount;
1962 nreclaimed < howmany); ) {
1963 xfs_dqlock(dqp); 1964 xfs_dqlock(dqp);
1964 1965
1965 /* 1966 /*
1966 * We are racing with dqlookup here. Naturally we don't 1967 * We are racing with dqlookup here. Naturally we don't
1967 * want to reclaim a dquot that lookup wants. 1968 * want to reclaim a dquot that lookup wants. We release the
1969 * freelist lock and start over, so that lookup will grab
1970 * both the dquot and the freelistlock.
1968 */ 1971 */
1969 if (dqp->dq_flags & XFS_DQ_WANT) { 1972 if (dqp->dq_flags & XFS_DQ_WANT) {
1973 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
1974
1975 trace_xfs_dqreclaim_want(dqp);
1976
1970 xfs_dqunlock(dqp); 1977 xfs_dqunlock(dqp);
1971 xfs_qm_freelist_unlock(xfs_Gqm); 1978 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1972 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) 1979 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
1973 return nreclaimed; 1980 return NULL;
1974 XQM_STATS_INC(xqmstats.xs_qm_dqwants); 1981 XQM_STATS_INC(xqmstats.xs_qm_dqwants);
1975 goto tryagain; 1982 goto startagain;
1976 } 1983 }
1977 1984
1978 /* 1985 /*
@@ -1981,23 +1988,27 @@ xfs_qm_shake_freelist(
1981 * life easier. 1988 * life easier.
1982 */ 1989 */
1983 if (dqp->dq_flags & XFS_DQ_INACTIVE) { 1990 if (dqp->dq_flags & XFS_DQ_INACTIVE) {
1984 ASSERT(dqp->q_mount == NULL); 1991 ASSERT(mp == NULL);
1985 ASSERT(! XFS_DQ_IS_DIRTY(dqp)); 1992 ASSERT(! XFS_DQ_IS_DIRTY(dqp));
1986 ASSERT(dqp->HL_PREVP == NULL); 1993 ASSERT(list_empty(&dqp->q_hashlist));
1987 ASSERT(dqp->MPL_PREVP == NULL); 1994 ASSERT(list_empty(&dqp->q_mplist));
1995 list_del_init(&dqp->q_freelist);
1996 xfs_Gqm->qm_dqfrlist_cnt--;
1997 xfs_dqunlock(dqp);
1998 dqpout = dqp;
1988 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); 1999 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
1989 nextdqp = dqp->dq_flnext; 2000 break;
1990 goto off_freelist;
1991 } 2001 }
1992 2002
1993 ASSERT(dqp->MPL_PREVP); 2003 ASSERT(dqp->q_hash);
2004 ASSERT(!list_empty(&dqp->q_mplist));
2005
1994 /* 2006 /*
1995 * Try to grab the flush lock. If this dquot is in the process of 2007 * Try to grab the flush lock. If this dquot is in the process of
1996 * getting flushed to disk, we don't want to reclaim it. 2008 * getting flushed to disk, we don't want to reclaim it.
1997 */ 2009 */
1998 if (!xfs_dqflock_nowait(dqp)) { 2010 if (!xfs_dqflock_nowait(dqp)) {
1999 xfs_dqunlock(dqp); 2011 xfs_dqunlock(dqp);
2000 dqp = dqp->dq_flnext;
2001 continue; 2012 continue;
2002 } 2013 }
2003 2014
@@ -2010,21 +2021,21 @@ xfs_qm_shake_freelist(
2010 if (XFS_DQ_IS_DIRTY(dqp)) { 2021 if (XFS_DQ_IS_DIRTY(dqp)) {
2011 int error; 2022 int error;
2012 2023
2013 trace_xfs_dqshake_dirty(dqp); 2024 trace_xfs_dqreclaim_dirty(dqp);
2014 2025
2015 /* 2026 /*
2016 * We flush it delayed write, so don't bother 2027 * We flush it delayed write, so don't bother
2017 * releasing the mplock. 2028 * releasing the freelist lock.
2018 */ 2029 */
2019 error = xfs_qm_dqflush(dqp, 0); 2030 error = xfs_qm_dqflush(dqp, 0);
2020 if (error) { 2031 if (error) {
2021 xfs_fs_cmn_err(CE_WARN, dqp->q_mount, 2032 xfs_fs_cmn_err(CE_WARN, mp,
2022 "xfs_qm_dqflush_all: dquot %p flush failed", dqp); 2033 "xfs_qm_dqreclaim: dquot %p flush failed", dqp);
2023 } 2034 }
2024 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ 2035 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
2025 dqp = dqp->dq_flnext;
2026 continue; 2036 continue;
2027 } 2037 }
2038
2028 /* 2039 /*
2029 * We're trying to get the hashlock out of order. This races 2040 * We're trying to get the hashlock out of order. This races
 2030 * with dqlookup; so, we give up and go to the next dquot if 2041 * with dqlookup; so, we give up and go to the next dquot if
@@ -2033,56 +2044,74 @@ xfs_qm_shake_freelist(
2033 * waiting for the freelist lock. 2044 * waiting for the freelist lock.
2034 */ 2045 */
2035 if (!mutex_trylock(&dqp->q_hash->qh_lock)) { 2046 if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
2036 xfs_dqfunlock(dqp); 2047 restarts++;
2037 xfs_dqunlock(dqp); 2048 goto dqfunlock;
2038 dqp = dqp->dq_flnext;
2039 continue;
2040 } 2049 }
2050
2041 /* 2051 /*
2042 * This races with dquot allocation code as well as dqflush_all 2052 * This races with dquot allocation code as well as dqflush_all
2043 * and reclaim code. So, if we failed to grab the mplist lock, 2053 * and reclaim code. So, if we failed to grab the mplist lock,
2044 * giveup everything and start over. 2054 * giveup everything and start over.
2045 */ 2055 */
2046 hash = dqp->q_hash; 2056 if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) {
2047 ASSERT(hash); 2057 restarts++;
2048 if (! xfs_qm_mplist_nowait(dqp->q_mount)) { 2058 mutex_unlock(&dqp->q_hash->qh_lock);
2049 /* XXX put a sentinel so that we can come back here */
2050 xfs_dqfunlock(dqp); 2059 xfs_dqfunlock(dqp);
2051 xfs_dqunlock(dqp); 2060 xfs_dqunlock(dqp);
2052 mutex_unlock(&hash->qh_lock); 2061 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
2053 xfs_qm_freelist_unlock(xfs_Gqm); 2062 if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS)
2054 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) 2063 return NULL;
2055 return nreclaimed; 2064 goto startagain;
2056 goto tryagain;
2057 } 2065 }
2058 2066
2059 trace_xfs_dqshake_unlink(dqp);
2060
2061#ifdef QUOTADEBUG
2062 cmn_err(CE_DEBUG, "Shake 0x%p, ID 0x%x\n",
2063 dqp, be32_to_cpu(dqp->q_core.d_id));
2064#endif
2065 ASSERT(dqp->q_nrefs == 0); 2067 ASSERT(dqp->q_nrefs == 0);
2066 nextdqp = dqp->dq_flnext; 2068 list_del_init(&dqp->q_mplist);
2067 XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp); 2069 mp->m_quotainfo->qi_dquots--;
2068 XQM_HASHLIST_REMOVE(hash, dqp); 2070 mp->m_quotainfo->qi_dqreclaims++;
2071 list_del_init(&dqp->q_hashlist);
2072 dqp->q_hash->qh_version++;
2073 list_del_init(&dqp->q_freelist);
2074 xfs_Gqm->qm_dqfrlist_cnt--;
2075 dqpout = dqp;
2076 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
2077 mutex_unlock(&dqp->q_hash->qh_lock);
2078dqfunlock:
2069 xfs_dqfunlock(dqp); 2079 xfs_dqfunlock(dqp);
2070 xfs_qm_mplist_unlock(dqp->q_mount);
2071 mutex_unlock(&hash->qh_lock);
2072
2073 off_freelist:
2074 XQM_FREELIST_REMOVE(dqp);
2075 xfs_dqunlock(dqp); 2080 xfs_dqunlock(dqp);
2076 nreclaimed++; 2081 if (dqpout)
2077 XQM_STATS_INC(xqmstats.xs_qm_dqshake_reclaims); 2082 break;
2083 if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
2084 return NULL;
2085 }
2086 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
2087 return dqpout;
2088}
2089
2090/*
2091 * Traverse the freelist of dquots and attempt to reclaim a maximum of
2092 * 'howmany' dquots. This operation races with dqlookup(), and attempts to
2093 * favor the lookup function ...
2094 */
2095STATIC int
2096xfs_qm_shake_freelist(
2097 int howmany)
2098{
2099 int nreclaimed = 0;
2100 xfs_dquot_t *dqp;
2101
2102 if (howmany <= 0)
2103 return 0;
2104
2105 while (nreclaimed < howmany) {
2106 dqp = xfs_qm_dqreclaim_one();
2107 if (!dqp)
2108 return nreclaimed;
2078 xfs_qm_dqdestroy(dqp); 2109 xfs_qm_dqdestroy(dqp);
2079 dqp = nextdqp; 2110 nreclaimed++;
2080 } 2111 }
2081 xfs_qm_freelist_unlock(xfs_Gqm);
2082 return nreclaimed; 2112 return nreclaimed;
2083} 2113}
2084 2114
2085
2086/* 2115/*
2087 * The kmem_shake interface is invoked when memory is running low. 2116 * The kmem_shake interface is invoked when memory is running low.
2088 */ 2117 */
@@ -2097,7 +2126,7 @@ xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask)
2097 if (!xfs_Gqm) 2126 if (!xfs_Gqm)
2098 return 0; 2127 return 0;
2099 2128
2100 nfree = xfs_Gqm->qm_dqfreelist.qh_nelems; /* free dquots */ 2129 nfree = xfs_Gqm->qm_dqfrlist_cnt; /* free dquots */
2101 /* incore dquots in all f/s's */ 2130 /* incore dquots in all f/s's */
2102 ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree; 2131 ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree;
2103 2132
@@ -2113,131 +2142,6 @@ xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask)
2113} 2142}
2114 2143
2115 2144
2116/*
2117 * Just pop the least recently used dquot off the freelist and
2118 * recycle it. The returned dquot is locked.
2119 */
2120STATIC xfs_dquot_t *
2121xfs_qm_dqreclaim_one(void)
2122{
2123 xfs_dquot_t *dqpout;
2124 xfs_dquot_t *dqp;
2125 int restarts;
2126 int nflushes;
2127
2128 restarts = 0;
2129 dqpout = NULL;
2130 nflushes = 0;
2131
2132 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
2133 startagain:
2134 xfs_qm_freelist_lock(xfs_Gqm);
2135
2136 FOREACH_DQUOT_IN_FREELIST(dqp, &(xfs_Gqm->qm_dqfreelist)) {
2137 xfs_dqlock(dqp);
2138
2139 /*
2140 * We are racing with dqlookup here. Naturally we don't
2141 * want to reclaim a dquot that lookup wants. We release the
2142 * freelist lock and start over, so that lookup will grab
2143 * both the dquot and the freelistlock.
2144 */
2145 if (dqp->dq_flags & XFS_DQ_WANT) {
2146 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
2147
2148 trace_xfs_dqreclaim_want(dqp);
2149
2150 xfs_dqunlock(dqp);
2151 xfs_qm_freelist_unlock(xfs_Gqm);
2152 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
2153 return NULL;
2154 XQM_STATS_INC(xqmstats.xs_qm_dqwants);
2155 goto startagain;
2156 }
2157
2158 /*
2159 * If the dquot is inactive, we are assured that it is
2160 * not on the mplist or the hashlist, and that makes our
2161 * life easier.
2162 */
2163 if (dqp->dq_flags & XFS_DQ_INACTIVE) {
2164 ASSERT(dqp->q_mount == NULL);
2165 ASSERT(! XFS_DQ_IS_DIRTY(dqp));
2166 ASSERT(dqp->HL_PREVP == NULL);
2167 ASSERT(dqp->MPL_PREVP == NULL);
2168 XQM_FREELIST_REMOVE(dqp);
2169 xfs_dqunlock(dqp);
2170 dqpout = dqp;
2171 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
2172 break;
2173 }
2174
2175 ASSERT(dqp->q_hash);
2176 ASSERT(dqp->MPL_PREVP);
2177
2178 /*
2179 * Try to grab the flush lock. If this dquot is in the process of
2180 * getting flushed to disk, we don't want to reclaim it.
2181 */
2182 if (!xfs_dqflock_nowait(dqp)) {
2183 xfs_dqunlock(dqp);
2184 continue;
2185 }
2186
2187 /*
2188 * We have the flush lock so we know that this is not in the
2189 * process of being flushed. So, if this is dirty, flush it
2190 * DELWRI so that we don't get a freelist infested with
2191 * dirty dquots.
2192 */
2193 if (XFS_DQ_IS_DIRTY(dqp)) {
2194 int error;
2195
2196 trace_xfs_dqreclaim_dirty(dqp);
2197
2198 /*
2199 * We flush it delayed write, so don't bother
2200 * releasing the freelist lock.
2201 */
2202 error = xfs_qm_dqflush(dqp, 0);
2203 if (error) {
2204 xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
2205 "xfs_qm_dqreclaim: dquot %p flush failed", dqp);
2206 }
2207 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
2208 continue;
2209 }
2210
2211 if (! xfs_qm_mplist_nowait(dqp->q_mount)) {
2212 xfs_dqfunlock(dqp);
2213 xfs_dqunlock(dqp);
2214 continue;
2215 }
2216
2217 if (!mutex_trylock(&dqp->q_hash->qh_lock))
2218 goto mplistunlock;
2219
2220 trace_xfs_dqreclaim_unlink(dqp);
2221
2222 ASSERT(dqp->q_nrefs == 0);
2223 XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp);
2224 XQM_HASHLIST_REMOVE(dqp->q_hash, dqp);
2225 XQM_FREELIST_REMOVE(dqp);
2226 dqpout = dqp;
2227 mutex_unlock(&dqp->q_hash->qh_lock);
2228 mplistunlock:
2229 xfs_qm_mplist_unlock(dqp->q_mount);
2230 xfs_dqfunlock(dqp);
2231 xfs_dqunlock(dqp);
2232 if (dqpout)
2233 break;
2234 }
2235
2236 xfs_qm_freelist_unlock(xfs_Gqm);
2237 return dqpout;
2238}
2239
2240
2241/*------------------------------------------------------------------*/ 2145/*------------------------------------------------------------------*/
2242 2146
2243/* 2147/*
@@ -2662,66 +2566,3 @@ xfs_qm_vop_create_dqattach(
2662 } 2566 }
2663} 2567}
2664 2568
2665/* ------------- list stuff -----------------*/
2666STATIC void
2667xfs_qm_freelist_init(xfs_frlist_t *ql)
2668{
2669 ql->qh_next = ql->qh_prev = (xfs_dquot_t *) ql;
2670 mutex_init(&ql->qh_lock);
2671 ql->qh_version = 0;
2672 ql->qh_nelems = 0;
2673}
2674
2675STATIC void
2676xfs_qm_freelist_destroy(xfs_frlist_t *ql)
2677{
2678 xfs_dquot_t *dqp, *nextdqp;
2679
2680 mutex_lock(&ql->qh_lock);
2681 for (dqp = ql->qh_next;
2682 dqp != (xfs_dquot_t *)ql; ) {
2683 xfs_dqlock(dqp);
2684 nextdqp = dqp->dq_flnext;
2685#ifdef QUOTADEBUG
2686 cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp);
2687#endif
2688 XQM_FREELIST_REMOVE(dqp);
2689 xfs_dqunlock(dqp);
2690 xfs_qm_dqdestroy(dqp);
2691 dqp = nextdqp;
2692 }
2693 mutex_unlock(&ql->qh_lock);
2694 mutex_destroy(&ql->qh_lock);
2695
2696 ASSERT(ql->qh_nelems == 0);
2697}
2698
2699STATIC void
2700xfs_qm_freelist_insert(xfs_frlist_t *ql, xfs_dquot_t *dq)
2701{
2702 dq->dq_flnext = ql->qh_next;
2703 dq->dq_flprev = (xfs_dquot_t *)ql;
2704 ql->qh_next = dq;
2705 dq->dq_flnext->dq_flprev = dq;
2706 xfs_Gqm->qm_dqfreelist.qh_nelems++;
2707 xfs_Gqm->qm_dqfreelist.qh_version++;
2708}
2709
2710void
2711xfs_qm_freelist_unlink(xfs_dquot_t *dq)
2712{
2713 xfs_dquot_t *next = dq->dq_flnext;
2714 xfs_dquot_t *prev = dq->dq_flprev;
2715
2716 next->dq_flprev = prev;
2717 prev->dq_flnext = next;
2718 dq->dq_flnext = dq->dq_flprev = dq;
2719 xfs_Gqm->qm_dqfreelist.qh_nelems--;
2720 xfs_Gqm->qm_dqfreelist.qh_version++;
2721}
2722
2723void
2724xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq)
2725{
2726 xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq);
2727}
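
The hunk above deletes the hand-rolled circular freelist (xfs_qm_freelist_init/destroy/insert/unlink/append, with manual qh_nelems and qh_version bookkeeping) in favour of a standard struct list_head guarded by qm_dqfrlist_lock, plus the explicit qm_dqfrlist_cnt counter that xfs_qm_shake() now reads. A minimal sketch of what the insert and unlink paths reduce to, assuming struct xfs_dquot gains a list_head member (called q_freelist here; only the xfs_qm side of the change is visible in this diff):

/* sketch only, not the committed code */
static void dqfrlist_add(struct xfs_dquot *dqp)
{
        mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
        list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist);
        xfs_Gqm->qm_dqfrlist_cnt++;
        mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
}

static void dqfrlist_remove(struct xfs_dquot *dqp)
{
        /* caller holds qm_dqfrlist_lock, as in xfs_qm_dqreclaim_one() */
        list_del_init(&dqp->q_freelist);
        xfs_Gqm->qm_dqfrlist_cnt--;
}

Keeping the counter updates under the same mutex is what lets the shaker's nfree read above stay a plain load of qm_dqfrlist_cnt.
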
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index 495564b8af38..c9446f1c726d 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -72,17 +72,6 @@ extern kmem_zone_t *qm_dqtrxzone;
72#define XFS_QM_MAX_DQCLUSTER_LOGSZ 3 72#define XFS_QM_MAX_DQCLUSTER_LOGSZ 3
73 73
74typedef xfs_dqhash_t xfs_dqlist_t; 74typedef xfs_dqhash_t xfs_dqlist_t;
75/*
76 * The freelist head. The first two fields match the first two in the
77 * xfs_dquot_t structure (in xfs_dqmarker_t)
78 */
79typedef struct xfs_frlist {
80 struct xfs_dquot *qh_next;
81 struct xfs_dquot *qh_prev;
82 struct mutex qh_lock;
83 uint qh_version;
84 uint qh_nelems;
85} xfs_frlist_t;
86 75
87/* 76/*
88 * Quota Manager (global) structure. Lives only in core. 77 * Quota Manager (global) structure. Lives only in core.
@@ -91,7 +80,9 @@ typedef struct xfs_qm {
91 xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */ 80 xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */
92 xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */ 81 xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */
93 uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */ 82 uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */
94 xfs_frlist_t qm_dqfreelist; /* freelist of dquots */ 83 struct list_head qm_dqfrlist; /* freelist of dquots */
84 struct mutex qm_dqfrlist_lock;
85 int qm_dqfrlist_cnt;
95 atomic_t qm_totaldquots; /* total incore dquots */ 86 atomic_t qm_totaldquots; /* total incore dquots */
96 uint qm_nrefs; /* file systems with quota on */ 87 uint qm_nrefs; /* file systems with quota on */
97 int qm_dqfree_ratio;/* ratio of free to inuse dquots */ 88 int qm_dqfree_ratio;/* ratio of free to inuse dquots */
@@ -106,7 +97,9 @@ typedef struct xfs_qm {
106typedef struct xfs_quotainfo { 97typedef struct xfs_quotainfo {
107 xfs_inode_t *qi_uquotaip; /* user quota inode */ 98 xfs_inode_t *qi_uquotaip; /* user quota inode */
108 xfs_inode_t *qi_gquotaip; /* group quota inode */ 99 xfs_inode_t *qi_gquotaip; /* group quota inode */
109 xfs_dqlist_t qi_dqlist; /* all dquots in filesys */ 100 struct list_head qi_dqlist; /* all dquots in filesys */
101 struct mutex qi_dqlist_lock;
102 int qi_dquots;
110 int qi_dqreclaims; /* a change here indicates 103 int qi_dqreclaims; /* a change here indicates
111 a removal in the dqlist */ 104 a removal in the dqlist */
112 time_t qi_btimelimit; /* limit for blks timer */ 105 time_t qi_btimelimit; /* limit for blks timer */
@@ -175,10 +168,6 @@ extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
175extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint); 168extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint);
176extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint); 169extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
177 170
178/* list stuff */
179extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *);
180extern void xfs_qm_freelist_unlink(xfs_dquot_t *);
181
182#ifdef DEBUG 171#ifdef DEBUG
183extern int xfs_qm_internalqcheck(xfs_mount_t *); 172extern int xfs_qm_internalqcheck(xfs_mount_t *);
184#else 173#else
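
In the quotainfo hunk above, the per-mount dquot list likewise changes from the hash-style xfs_dqlist_t to a bare struct list_head plus qi_dqlist_lock, and the element count becomes the plain integer qi_dquots that xfs_qm_scall_getqstat reads further down in place of XFS_QI_MPLNDQUOTS. A sketch of a hypothetical walker over the new list, assuming struct xfs_dquot carries a list_head chained onto qi_dqlist (called q_mplist here; the member name is not shown in this part of the diff):

/* hypothetical walker, for illustration only */
static int count_dirty_dquots(struct xfs_quotainfo *q)
{
        struct xfs_dquot *dqp;
        int ndirty = 0;

        mutex_lock(&q->qi_dqlist_lock);
        list_for_each_entry(dqp, &q->qi_dqlist, q_mplist)
                if (XFS_DQ_IS_DIRTY(dqp))
                        ndirty++;
        mutex_unlock(&q->qi_dqlist_lock);
        return ndirty;
}
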
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 83e7ea3e25fa..3d1fc79532e2 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -55,7 +55,7 @@ static int xqm_proc_show(struct seq_file *m, void *v)
55 ndquot, 55 ndquot,
56 xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0, 56 xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
57 xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0, 57 xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
58 xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0); 58 xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0);
59 return 0; 59 return 0;
60} 60}
61 61
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 5d0ee8d492db..26fa43140f2e 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -79,6 +79,7 @@ xfs_qm_scall_quotaoff(
79 xfs_mount_t *mp, 79 xfs_mount_t *mp,
80 uint flags) 80 uint flags)
81{ 81{
82 struct xfs_quotainfo *q = mp->m_quotainfo;
82 uint dqtype; 83 uint dqtype;
83 int error; 84 int error;
84 uint inactivate_flags; 85 uint inactivate_flags;
@@ -102,11 +103,8 @@ xfs_qm_scall_quotaoff(
102 * critical thing. 103 * critical thing.
103 * If quotaoff, then we must be dealing with the root filesystem. 104 * If quotaoff, then we must be dealing with the root filesystem.
104 */ 105 */
105 ASSERT(mp->m_quotainfo); 106 ASSERT(q);
106 if (mp->m_quotainfo) 107 mutex_lock(&q->qi_quotaofflock);
107 mutex_lock(&(XFS_QI_QOFFLOCK(mp)));
108
109 ASSERT(mp->m_quotainfo);
110 108
111 /* 109 /*
112 * If we're just turning off quota enforcement, change mp and go. 110 * If we're just turning off quota enforcement, change mp and go.
@@ -117,7 +115,7 @@ xfs_qm_scall_quotaoff(
117 spin_lock(&mp->m_sb_lock); 115 spin_lock(&mp->m_sb_lock);
118 mp->m_sb.sb_qflags = mp->m_qflags; 116 mp->m_sb.sb_qflags = mp->m_qflags;
119 spin_unlock(&mp->m_sb_lock); 117 spin_unlock(&mp->m_sb_lock);
120 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 118 mutex_unlock(&q->qi_quotaofflock);
121 119
122 /* XXX what to do if error ? Revert back to old vals incore ? */ 120 /* XXX what to do if error ? Revert back to old vals incore ? */
123 error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS); 121 error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
@@ -150,10 +148,8 @@ xfs_qm_scall_quotaoff(
150 * Nothing to do? Don't complain. This happens when we're just 148 * Nothing to do? Don't complain. This happens when we're just
151 * turning off quota enforcement. 149 * turning off quota enforcement.
152 */ 150 */
153 if ((mp->m_qflags & flags) == 0) { 151 if ((mp->m_qflags & flags) == 0)
154 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 152 goto out_unlock;
155 return (0);
156 }
157 153
158 /* 154 /*
159 * Write the LI_QUOTAOFF log record, and do SB changes atomically, 155 * Write the LI_QUOTAOFF log record, and do SB changes atomically,
@@ -162,7 +158,7 @@ xfs_qm_scall_quotaoff(
162 */ 158 */
163 error = xfs_qm_log_quotaoff(mp, &qoffstart, flags); 159 error = xfs_qm_log_quotaoff(mp, &qoffstart, flags);
164 if (error) 160 if (error)
165 goto out_error; 161 goto out_unlock;
166 162
167 /* 163 /*
168 * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct 164 * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct
@@ -204,7 +200,7 @@ xfs_qm_scall_quotaoff(
204 * So, if we couldn't purge all the dquots from the filesystem, 200 * So, if we couldn't purge all the dquots from the filesystem,
205 * we can't get rid of the incore data structures. 201 * we can't get rid of the incore data structures.
206 */ 202 */
207 while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype|XFS_QMOPT_QUOTAOFF))) 203 while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype)))
208 delay(10 * nculprits); 204 delay(10 * nculprits);
209 205
210 /* 206 /*
@@ -222,7 +218,7 @@ xfs_qm_scall_quotaoff(
222 if (error) { 218 if (error) {
223 /* We're screwed now. Shutdown is the only option. */ 219 /* We're screwed now. Shutdown is the only option. */
224 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 220 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
225 goto out_error; 221 goto out_unlock;
226 } 222 }
227 223
228 /* 224 /*
@@ -230,27 +226,26 @@ xfs_qm_scall_quotaoff(
230 */ 226 */
231 if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) || 227 if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) ||
232 ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) { 228 ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) {
233 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 229 mutex_unlock(&q->qi_quotaofflock);
234 xfs_qm_destroy_quotainfo(mp); 230 xfs_qm_destroy_quotainfo(mp);
235 return (0); 231 return (0);
236 } 232 }
237 233
238 /* 234 /*
239 * Release our quotainode references, and vn_purge them, 235 * Release our quotainode references if we don't need them anymore.
240 * if we don't need them anymore.
241 */ 236 */
242 if ((dqtype & XFS_QMOPT_UQUOTA) && XFS_QI_UQIP(mp)) { 237 if ((dqtype & XFS_QMOPT_UQUOTA) && q->qi_uquotaip) {
243 IRELE(XFS_QI_UQIP(mp)); 238 IRELE(q->qi_uquotaip);
244 XFS_QI_UQIP(mp) = NULL; 239 q->qi_uquotaip = NULL;
245 } 240 }
246 if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && XFS_QI_GQIP(mp)) { 241 if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && q->qi_gquotaip) {
247 IRELE(XFS_QI_GQIP(mp)); 242 IRELE(q->qi_gquotaip);
248 XFS_QI_GQIP(mp) = NULL; 243 q->qi_gquotaip = NULL;
249 } 244 }
250out_error:
251 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
252 245
253 return (error); 246out_unlock:
247 mutex_unlock(&q->qi_quotaofflock);
248 return error;
254} 249}
255 250
256int 251int
@@ -379,9 +374,9 @@ xfs_qm_scall_quotaon(
379 /* 374 /*
380 * Switch on quota enforcement in core. 375 * Switch on quota enforcement in core.
381 */ 376 */
382 mutex_lock(&(XFS_QI_QOFFLOCK(mp))); 377 mutex_lock(&mp->m_quotainfo->qi_quotaofflock);
383 mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD); 378 mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD);
384 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 379 mutex_unlock(&mp->m_quotainfo->qi_quotaofflock);
385 380
386 return (0); 381 return (0);
387} 382}
@@ -392,11 +387,12 @@ xfs_qm_scall_quotaon(
392 */ 387 */
393int 388int
394xfs_qm_scall_getqstat( 389xfs_qm_scall_getqstat(
395 xfs_mount_t *mp, 390 struct xfs_mount *mp,
396 fs_quota_stat_t *out) 391 struct fs_quota_stat *out)
397{ 392{
398 xfs_inode_t *uip, *gip; 393 struct xfs_quotainfo *q = mp->m_quotainfo;
399 boolean_t tempuqip, tempgqip; 394 struct xfs_inode *uip, *gip;
395 boolean_t tempuqip, tempgqip;
400 396
401 uip = gip = NULL; 397 uip = gip = NULL;
402 tempuqip = tempgqip = B_FALSE; 398 tempuqip = tempgqip = B_FALSE;
@@ -415,9 +411,9 @@ xfs_qm_scall_getqstat(
415 out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino; 411 out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
416 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; 412 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
417 413
418 if (mp->m_quotainfo) { 414 if (q) {
419 uip = mp->m_quotainfo->qi_uquotaip; 415 uip = q->qi_uquotaip;
420 gip = mp->m_quotainfo->qi_gquotaip; 416 gip = q->qi_gquotaip;
421 } 417 }
422 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { 418 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
423 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 419 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
@@ -441,15 +437,15 @@ xfs_qm_scall_getqstat(
441 if (tempgqip) 437 if (tempgqip)
442 IRELE(gip); 438 IRELE(gip);
443 } 439 }
444 if (mp->m_quotainfo) { 440 if (q) {
445 out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp); 441 out->qs_incoredqs = q->qi_dquots;
446 out->qs_btimelimit = XFS_QI_BTIMELIMIT(mp); 442 out->qs_btimelimit = q->qi_btimelimit;
447 out->qs_itimelimit = XFS_QI_ITIMELIMIT(mp); 443 out->qs_itimelimit = q->qi_itimelimit;
448 out->qs_rtbtimelimit = XFS_QI_RTBTIMELIMIT(mp); 444 out->qs_rtbtimelimit = q->qi_rtbtimelimit;
449 out->qs_bwarnlimit = XFS_QI_BWARNLIMIT(mp); 445 out->qs_bwarnlimit = q->qi_bwarnlimit;
450 out->qs_iwarnlimit = XFS_QI_IWARNLIMIT(mp); 446 out->qs_iwarnlimit = q->qi_iwarnlimit;
451 } 447 }
452 return (0); 448 return 0;
453} 449}
454 450
455/* 451/*
@@ -462,6 +458,7 @@ xfs_qm_scall_setqlim(
462 uint type, 458 uint type,
463 fs_disk_quota_t *newlim) 459 fs_disk_quota_t *newlim)
464{ 460{
461 struct xfs_quotainfo *q = mp->m_quotainfo;
465 xfs_disk_dquot_t *ddq; 462 xfs_disk_dquot_t *ddq;
466 xfs_dquot_t *dqp; 463 xfs_dquot_t *dqp;
467 xfs_trans_t *tp; 464 xfs_trans_t *tp;
@@ -485,7 +482,7 @@ xfs_qm_scall_setqlim(
485 * a quotaoff from happening). (XXXThis doesn't currently happen 482 * a quotaoff from happening). (XXXThis doesn't currently happen
486 * because we take the vfslock before calling xfs_qm_sysent). 483 * because we take the vfslock before calling xfs_qm_sysent).
487 */ 484 */
488 mutex_lock(&(XFS_QI_QOFFLOCK(mp))); 485 mutex_lock(&q->qi_quotaofflock);
489 486
490 /* 487 /*
491 * Get the dquot (locked), and join it to the transaction. 488 * Get the dquot (locked), and join it to the transaction.
@@ -493,9 +490,8 @@ xfs_qm_scall_setqlim(
493 */ 490 */
494 if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) { 491 if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) {
495 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 492 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
496 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
497 ASSERT(error != ENOENT); 493 ASSERT(error != ENOENT);
498 return (error); 494 goto out_unlock;
499 } 495 }
500 xfs_trans_dqjoin(tp, dqp); 496 xfs_trans_dqjoin(tp, dqp);
501 ddq = &dqp->q_core; 497 ddq = &dqp->q_core;
@@ -513,8 +509,8 @@ xfs_qm_scall_setqlim(
513 ddq->d_blk_hardlimit = cpu_to_be64(hard); 509 ddq->d_blk_hardlimit = cpu_to_be64(hard);
514 ddq->d_blk_softlimit = cpu_to_be64(soft); 510 ddq->d_blk_softlimit = cpu_to_be64(soft);
515 if (id == 0) { 511 if (id == 0) {
516 mp->m_quotainfo->qi_bhardlimit = hard; 512 q->qi_bhardlimit = hard;
517 mp->m_quotainfo->qi_bsoftlimit = soft; 513 q->qi_bsoftlimit = soft;
518 } 514 }
519 } else { 515 } else {
520 qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft); 516 qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft);
@@ -529,8 +525,8 @@ xfs_qm_scall_setqlim(
529 ddq->d_rtb_hardlimit = cpu_to_be64(hard); 525 ddq->d_rtb_hardlimit = cpu_to_be64(hard);
530 ddq->d_rtb_softlimit = cpu_to_be64(soft); 526 ddq->d_rtb_softlimit = cpu_to_be64(soft);
531 if (id == 0) { 527 if (id == 0) {
532 mp->m_quotainfo->qi_rtbhardlimit = hard; 528 q->qi_rtbhardlimit = hard;
533 mp->m_quotainfo->qi_rtbsoftlimit = soft; 529 q->qi_rtbsoftlimit = soft;
534 } 530 }
535 } else { 531 } else {
536 qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft); 532 qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
@@ -546,8 +542,8 @@ xfs_qm_scall_setqlim(
546 ddq->d_ino_hardlimit = cpu_to_be64(hard); 542 ddq->d_ino_hardlimit = cpu_to_be64(hard);
547 ddq->d_ino_softlimit = cpu_to_be64(soft); 543 ddq->d_ino_softlimit = cpu_to_be64(soft);
548 if (id == 0) { 544 if (id == 0) {
549 mp->m_quotainfo->qi_ihardlimit = hard; 545 q->qi_ihardlimit = hard;
550 mp->m_quotainfo->qi_isoftlimit = soft; 546 q->qi_isoftlimit = soft;
551 } 547 }
552 } else { 548 } else {
553 qdprintk("ihard %Ld < isoft %Ld\n", hard, soft); 549 qdprintk("ihard %Ld < isoft %Ld\n", hard, soft);
@@ -572,23 +568,23 @@ xfs_qm_scall_setqlim(
572 * for warnings. 568 * for warnings.
573 */ 569 */
574 if (newlim->d_fieldmask & FS_DQ_BTIMER) { 570 if (newlim->d_fieldmask & FS_DQ_BTIMER) {
575 mp->m_quotainfo->qi_btimelimit = newlim->d_btimer; 571 q->qi_btimelimit = newlim->d_btimer;
576 ddq->d_btimer = cpu_to_be32(newlim->d_btimer); 572 ddq->d_btimer = cpu_to_be32(newlim->d_btimer);
577 } 573 }
578 if (newlim->d_fieldmask & FS_DQ_ITIMER) { 574 if (newlim->d_fieldmask & FS_DQ_ITIMER) {
579 mp->m_quotainfo->qi_itimelimit = newlim->d_itimer; 575 q->qi_itimelimit = newlim->d_itimer;
580 ddq->d_itimer = cpu_to_be32(newlim->d_itimer); 576 ddq->d_itimer = cpu_to_be32(newlim->d_itimer);
581 } 577 }
582 if (newlim->d_fieldmask & FS_DQ_RTBTIMER) { 578 if (newlim->d_fieldmask & FS_DQ_RTBTIMER) {
583 mp->m_quotainfo->qi_rtbtimelimit = newlim->d_rtbtimer; 579 q->qi_rtbtimelimit = newlim->d_rtbtimer;
584 ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer); 580 ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer);
585 } 581 }
586 if (newlim->d_fieldmask & FS_DQ_BWARNS) 582 if (newlim->d_fieldmask & FS_DQ_BWARNS)
587 mp->m_quotainfo->qi_bwarnlimit = newlim->d_bwarns; 583 q->qi_bwarnlimit = newlim->d_bwarns;
588 if (newlim->d_fieldmask & FS_DQ_IWARNS) 584 if (newlim->d_fieldmask & FS_DQ_IWARNS)
589 mp->m_quotainfo->qi_iwarnlimit = newlim->d_iwarns; 585 q->qi_iwarnlimit = newlim->d_iwarns;
590 if (newlim->d_fieldmask & FS_DQ_RTBWARNS) 586 if (newlim->d_fieldmask & FS_DQ_RTBWARNS)
591 mp->m_quotainfo->qi_rtbwarnlimit = newlim->d_rtbwarns; 587 q->qi_rtbwarnlimit = newlim->d_rtbwarns;
592 } else { 588 } else {
593 /* 589 /*
594 * If the user is now over quota, start the timelimit. 590 * If the user is now over quota, start the timelimit.
@@ -605,8 +601,9 @@ xfs_qm_scall_setqlim(
605 error = xfs_trans_commit(tp, 0); 601 error = xfs_trans_commit(tp, 0);
606 xfs_qm_dqprint(dqp); 602 xfs_qm_dqprint(dqp);
607 xfs_qm_dqrele(dqp); 603 xfs_qm_dqrele(dqp);
608 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
609 604
605 out_unlock:
606 mutex_unlock(&q->qi_quotaofflock);
610 return error; 607 return error;
611} 608}
612 609
@@ -853,7 +850,8 @@ xfs_dqrele_inode(
853 int error; 850 int error;
854 851
855 /* skip quota inodes */ 852 /* skip quota inodes */
856 if (ip == XFS_QI_UQIP(ip->i_mount) || ip == XFS_QI_GQIP(ip->i_mount)) { 853 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
854 ip == ip->i_mount->m_quotainfo->qi_gquotaip) {
857 ASSERT(ip->i_udquot == NULL); 855 ASSERT(ip->i_udquot == NULL);
858 ASSERT(ip->i_gdquot == NULL); 856 ASSERT(ip->i_gdquot == NULL);
859 read_unlock(&pag->pag_ici_lock); 857 read_unlock(&pag->pag_ici_lock);
@@ -891,7 +889,8 @@ xfs_qm_dqrele_all_inodes(
891 uint flags) 889 uint flags)
892{ 890{
893 ASSERT(mp->m_quotainfo); 891 ASSERT(mp->m_quotainfo);
894 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0); 892 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags,
893 XFS_ICI_NO_TAG, 0, NULL);
895} 894}
896 895
897/*------------------------------------------------------------------------*/ 896/*------------------------------------------------------------------------*/
@@ -930,7 +929,8 @@ struct mutex qcheck_lock;
930} 929}
931 930
932typedef struct dqtest { 931typedef struct dqtest {
933 xfs_dqmarker_t q_lists; 932 uint dq_flags; /* various flags (XFS_DQ_*) */
933 struct list_head q_hashlist;
934 xfs_dqhash_t *q_hash; /* the hashchain header */ 934 xfs_dqhash_t *q_hash; /* the hashchain header */
935 xfs_mount_t *q_mount; /* filesystem this relates to */ 935 xfs_mount_t *q_mount; /* filesystem this relates to */
936 xfs_dqid_t d_id; /* user id or group id */ 936 xfs_dqid_t d_id; /* user id or group id */
@@ -941,14 +941,9 @@ typedef struct dqtest {
941STATIC void 941STATIC void
942xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp) 942xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp)
943{ 943{
944 xfs_dquot_t *d; 944 list_add(&dqp->q_hashlist, &h->qh_list);
945 if (((d) = (h)->qh_next)) 945 h->qh_version++;
946 (d)->HL_PREVP = &((dqp)->HL_NEXT); 946 h->qh_nelems++;
947 (dqp)->HL_NEXT = d;
948 (dqp)->HL_PREVP = &((h)->qh_next);
949 (h)->qh_next = (xfs_dquot_t *)dqp;
950 (h)->qh_version++;
951 (h)->qh_nelems++;
952} 947}
953STATIC void 948STATIC void
954xfs_qm_dqtest_print( 949xfs_qm_dqtest_print(
@@ -1060,9 +1055,7 @@ xfs_qm_internalqcheck_dqget(
1060 xfs_dqhash_t *h; 1055 xfs_dqhash_t *h;
1061 1056
1062 h = DQTEST_HASH(mp, id, type); 1057 h = DQTEST_HASH(mp, id, type);
1063 for (d = (xfs_dqtest_t *) h->qh_next; d != NULL; 1058 list_for_each_entry(d, &h->qh_list, q_hashlist) {
1064 d = (xfs_dqtest_t *) d->HL_NEXT) {
1065 /* DQTEST_LIST_PRINT(h, HL_NEXT, "@@@@@ dqtestlist @@@@@"); */
1066 if (d->d_id == id && mp == d->q_mount) { 1059 if (d->d_id == id && mp == d->q_mount) {
1067 *O_dq = d; 1060 *O_dq = d;
1068 return (0); 1061 return (0);
@@ -1073,6 +1066,7 @@ xfs_qm_internalqcheck_dqget(
1073 d->d_id = id; 1066 d->d_id = id;
1074 d->q_mount = mp; 1067 d->q_mount = mp;
1075 d->q_hash = h; 1068 d->q_hash = h;
1069 INIT_LIST_HEAD(&d->q_hashlist);
1076 xfs_qm_hashinsert(h, d); 1070 xfs_qm_hashinsert(h, d);
1077 *O_dq = d; 1071 *O_dq = d;
1078 return (0); 1072 return (0);
@@ -1179,8 +1173,6 @@ xfs_qm_internalqcheck(
1179 xfs_ino_t lastino; 1173 xfs_ino_t lastino;
1180 int done, count; 1174 int done, count;
1181 int i; 1175 int i;
1182 xfs_dqtest_t *d, *e;
1183 xfs_dqhash_t *h1;
1184 int error; 1176 int error;
1185 1177
1186 lastino = 0; 1178 lastino = 0;
@@ -1220,19 +1212,18 @@ xfs_qm_internalqcheck(
1220 } 1212 }
1221 cmn_err(CE_DEBUG, "Checking results against system dquots"); 1213 cmn_err(CE_DEBUG, "Checking results against system dquots");
1222 for (i = 0; i < qmtest_hashmask; i++) { 1214 for (i = 0; i < qmtest_hashmask; i++) {
1223 h1 = &qmtest_udqtab[i]; 1215 xfs_dqtest_t *d, *n;
1224 for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { 1216 xfs_dqhash_t *h;
1217
1218 h = &qmtest_udqtab[i];
1219 list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) {
1225 xfs_dqtest_cmp(d); 1220 xfs_dqtest_cmp(d);
1226 e = (xfs_dqtest_t *) d->HL_NEXT;
1227 kmem_free(d); 1221 kmem_free(d);
1228 d = e;
1229 } 1222 }
1230 h1 = &qmtest_gdqtab[i]; 1223 h = &qmtest_gdqtab[i];
1231 for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { 1224 list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) {
1232 xfs_dqtest_cmp(d); 1225 xfs_dqtest_cmp(d);
1233 e = (xfs_dqtest_t *) d->HL_NEXT;
1234 kmem_free(d); 1226 kmem_free(d);
1235 d = e;
1236 } 1227 }
1237 } 1228 }
1238 1229
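
The internalqcheck teardown loop above is the interesting conversion: the open-coded walk that saved d->HL_NEXT into e before freeing d becomes list_for_each_entry_safe(), which captures the next node up front precisely so the body may free the current one. The general idiom, as a self-contained sketch:

#include <linux/list.h>
#include <linux/slab.h>

struct item {
        struct list_head link;
        /* payload ... */
};

static void free_all(struct list_head *head)
{
        struct item *it, *next;

        /* 'next' is fetched before the body runs, so kfree(it) is safe */
        list_for_each_entry_safe(it, next, head, link) {
                list_del(&it->link);
                kfree(it);
        }
}

The non-_safe list_for_each_entry() would dereference it->link.next after the kfree(), a use-after-free.
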
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h
index 8286b2842b6b..94a3d927d716 100644
--- a/fs/xfs/quota/xfs_quota_priv.h
+++ b/fs/xfs/quota/xfs_quota_priv.h
@@ -24,43 +24,6 @@
24 */ 24 */
25#define XFS_DQITER_MAP_SIZE 10 25#define XFS_DQITER_MAP_SIZE 10
26 26
27/* Number of dquots that fit in to a dquot block */
28#define XFS_QM_DQPERBLK(mp) ((mp)->m_quotainfo->qi_dqperchunk)
29
30#define XFS_DQ_IS_ADDEDTO_TRX(t, d) ((d)->q_transp == (t))
31
32#define XFS_QI_MPLRECLAIMS(mp) ((mp)->m_quotainfo->qi_dqreclaims)
33#define XFS_QI_UQIP(mp) ((mp)->m_quotainfo->qi_uquotaip)
34#define XFS_QI_GQIP(mp) ((mp)->m_quotainfo->qi_gquotaip)
35#define XFS_QI_DQCHUNKLEN(mp) ((mp)->m_quotainfo->qi_dqchunklen)
36#define XFS_QI_BTIMELIMIT(mp) ((mp)->m_quotainfo->qi_btimelimit)
37#define XFS_QI_RTBTIMELIMIT(mp) ((mp)->m_quotainfo->qi_rtbtimelimit)
38#define XFS_QI_ITIMELIMIT(mp) ((mp)->m_quotainfo->qi_itimelimit)
39#define XFS_QI_BWARNLIMIT(mp) ((mp)->m_quotainfo->qi_bwarnlimit)
40#define XFS_QI_RTBWARNLIMIT(mp) ((mp)->m_quotainfo->qi_rtbwarnlimit)
41#define XFS_QI_IWARNLIMIT(mp) ((mp)->m_quotainfo->qi_iwarnlimit)
42#define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock)
43
44#define XFS_QI_MPL_LIST(mp) ((mp)->m_quotainfo->qi_dqlist)
45#define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next)
46#define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems)
47
48#define xfs_qm_mplist_lock(mp) \
49 mutex_lock(&(XFS_QI_MPL_LIST(mp).qh_lock))
50#define xfs_qm_mplist_nowait(mp) \
51 mutex_trylock(&(XFS_QI_MPL_LIST(mp).qh_lock))
52#define xfs_qm_mplist_unlock(mp) \
53 mutex_unlock(&(XFS_QI_MPL_LIST(mp).qh_lock))
54#define XFS_QM_IS_MPLIST_LOCKED(mp) \
55 mutex_is_locked(&(XFS_QI_MPL_LIST(mp).qh_lock))
56
57#define xfs_qm_freelist_lock(qm) \
58 mutex_lock(&((qm)->qm_dqfreelist.qh_lock))
59#define xfs_qm_freelist_lock_nowait(qm) \
60 mutex_trylock(&((qm)->qm_dqfreelist.qh_lock))
61#define xfs_qm_freelist_unlock(qm) \
62 mutex_unlock(&((qm)->qm_dqfreelist.qh_lock))
63
64/* 27/*
65 * Hash into a bucket in the dquot hash table, based on <mp, id>. 28 * Hash into a bucket in the dquot hash table, based on <mp, id>.
66 */ 29 */
@@ -72,9 +35,6 @@
72 XFS_DQ_HASHVAL(mp, id)) : \ 35 XFS_DQ_HASHVAL(mp, id)) : \
73 (xfs_Gqm->qm_grp_dqhtable + \ 36 (xfs_Gqm->qm_grp_dqhtable + \
74 XFS_DQ_HASHVAL(mp, id))) 37 XFS_DQ_HASHVAL(mp, id)))
75#define XFS_IS_DQTYPE_ON(mp, type) (type == XFS_DQ_USER ? \
76 XFS_IS_UQUOTA_ON(mp) : \
77 XFS_IS_OQUOTA_ON(mp))
78#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ 38#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
79 !dqp->q_core.d_blk_hardlimit && \ 39 !dqp->q_core.d_blk_hardlimit && \
80 !dqp->q_core.d_blk_softlimit && \ 40 !dqp->q_core.d_blk_softlimit && \
@@ -86,68 +46,6 @@
86 !dqp->q_core.d_rtbcount && \ 46 !dqp->q_core.d_rtbcount && \
87 !dqp->q_core.d_icount) 47 !dqp->q_core.d_icount)
88 48
89#define HL_PREVP dq_hashlist.ql_prevp
90#define HL_NEXT dq_hashlist.ql_next
91#define MPL_PREVP dq_mplist.ql_prevp
92#define MPL_NEXT dq_mplist.ql_next
93
94
95#define _LIST_REMOVE(h, dqp, PVP, NXT) \
96 { \
97 xfs_dquot_t *d; \
98 if (((d) = (dqp)->NXT)) \
99 (d)->PVP = (dqp)->PVP; \
100 *((dqp)->PVP) = d; \
101 (dqp)->NXT = NULL; \
102 (dqp)->PVP = NULL; \
103 (h)->qh_version++; \
104 (h)->qh_nelems--; \
105 }
106
107#define _LIST_INSERT(h, dqp, PVP, NXT) \
108 { \
109 xfs_dquot_t *d; \
110 if (((d) = (h)->qh_next)) \
111 (d)->PVP = &((dqp)->NXT); \
112 (dqp)->NXT = d; \
113 (dqp)->PVP = &((h)->qh_next); \
114 (h)->qh_next = dqp; \
115 (h)->qh_version++; \
116 (h)->qh_nelems++; \
117 }
118
119#define FOREACH_DQUOT_IN_MP(dqp, mp) \
120 for ((dqp) = XFS_QI_MPLNEXT(mp); (dqp) != NULL; (dqp) = (dqp)->MPL_NEXT)
121
122#define FOREACH_DQUOT_IN_FREELIST(dqp, qlist) \
123for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \
124 (dqp) = (dqp)->dq_flnext)
125
126#define XQM_HASHLIST_INSERT(h, dqp) \
127 _LIST_INSERT(h, dqp, HL_PREVP, HL_NEXT)
128
129#define XQM_FREELIST_INSERT(h, dqp) \
130 xfs_qm_freelist_append(h, dqp)
131
132#define XQM_MPLIST_INSERT(h, dqp) \
133 _LIST_INSERT(h, dqp, MPL_PREVP, MPL_NEXT)
134
135#define XQM_HASHLIST_REMOVE(h, dqp) \
136 _LIST_REMOVE(h, dqp, HL_PREVP, HL_NEXT)
137#define XQM_FREELIST_REMOVE(dqp) \
138 xfs_qm_freelist_unlink(dqp)
139#define XQM_MPLIST_REMOVE(h, dqp) \
140 { _LIST_REMOVE(h, dqp, MPL_PREVP, MPL_NEXT); \
141 XFS_QI_MPLRECLAIMS((dqp)->q_mount)++; }
142
143#define XFS_DQ_IS_LOGITEM_INITD(dqp) ((dqp)->q_logitem.qli_dquot == (dqp))
144
145#define XFS_QM_DQP_TO_DQACCT(tp, dqp) (XFS_QM_ISUDQ(dqp) ? \
146 (tp)->t_dqinfo->dqa_usrdquots : \
147 (tp)->t_dqinfo->dqa_grpdquots)
148#define XFS_IS_SUSER_DQUOT(dqp) \
149 (!((dqp)->q_core.d_id))
150
151#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \ 49#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \
152 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \ 50 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \
153 (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???"))) 51 (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???")))
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index c3ab75cb1d9a..061d827da33c 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -59,12 +59,11 @@ xfs_trans_dqjoin(
59 xfs_trans_t *tp, 59 xfs_trans_t *tp,
60 xfs_dquot_t *dqp) 60 xfs_dquot_t *dqp)
61{ 61{
62 xfs_dq_logitem_t *lp; 62 xfs_dq_logitem_t *lp = &dqp->q_logitem;
63 63
64 ASSERT(! XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); 64 ASSERT(dqp->q_transp != tp);
65 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 65 ASSERT(XFS_DQ_IS_LOCKED(dqp));
66 ASSERT(XFS_DQ_IS_LOGITEM_INITD(dqp)); 66 ASSERT(lp->qli_dquot == dqp);
67 lp = &dqp->q_logitem;
68 67
69 /* 68 /*
70 * Get a log_item_desc to point at the new item. 69 * Get a log_item_desc to point at the new item.
@@ -96,7 +95,7 @@ xfs_trans_log_dquot(
96{ 95{
97 xfs_log_item_desc_t *lidp; 96 xfs_log_item_desc_t *lidp;
98 97
99 ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); 98 ASSERT(dqp->q_transp == tp);
100 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 99 ASSERT(XFS_DQ_IS_LOCKED(dqp));
101 100
102 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem)); 101 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem));
@@ -198,16 +197,16 @@ xfs_trans_get_dqtrx(
198 int i; 197 int i;
199 xfs_dqtrx_t *qa; 198 xfs_dqtrx_t *qa;
200 199
201 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { 200 qa = XFS_QM_ISUDQ(dqp) ?
202 qa = XFS_QM_DQP_TO_DQACCT(tp, dqp); 201 tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots;
203 202
203 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
204 if (qa[i].qt_dquot == NULL || 204 if (qa[i].qt_dquot == NULL ||
205 qa[i].qt_dquot == dqp) { 205 qa[i].qt_dquot == dqp)
206 return (&qa[i]); 206 return &qa[i];
207 }
208 } 207 }
209 208
210 return (NULL); 209 return NULL;
211} 210}
212 211
213/* 212/*
@@ -381,7 +380,7 @@ xfs_trans_apply_dquot_deltas(
381 break; 380 break;
382 381
383 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 382 ASSERT(XFS_DQ_IS_LOCKED(dqp));
384 ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); 383 ASSERT(dqp->q_transp == tp);
385 384
386 /* 385 /*
387 * adjust the actual number of blocks used 386 * adjust the actual number of blocks used
@@ -639,7 +638,7 @@ xfs_trans_dqresv(
639 softlimit = q->qi_bsoftlimit; 638 softlimit = q->qi_bsoftlimit;
640 timer = be32_to_cpu(dqp->q_core.d_btimer); 639 timer = be32_to_cpu(dqp->q_core.d_btimer);
641 warns = be16_to_cpu(dqp->q_core.d_bwarns); 640 warns = be16_to_cpu(dqp->q_core.d_bwarns);
642 warnlimit = XFS_QI_BWARNLIMIT(dqp->q_mount); 641 warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit;
643 resbcountp = &dqp->q_res_bcount; 642 resbcountp = &dqp->q_res_bcount;
644 } else { 643 } else {
645 ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS); 644 ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS);
@@ -651,7 +650,7 @@ xfs_trans_dqresv(
651 softlimit = q->qi_rtbsoftlimit; 650 softlimit = q->qi_rtbsoftlimit;
652 timer = be32_to_cpu(dqp->q_core.d_rtbtimer); 651 timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
653 warns = be16_to_cpu(dqp->q_core.d_rtbwarns); 652 warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
654 warnlimit = XFS_QI_RTBWARNLIMIT(dqp->q_mount); 653 warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit;
655 resbcountp = &dqp->q_res_rtbcount; 654 resbcountp = &dqp->q_res_rtbcount;
656 } 655 }
657 656
@@ -691,7 +690,7 @@ xfs_trans_dqresv(
691 count = be64_to_cpu(dqp->q_core.d_icount); 690 count = be64_to_cpu(dqp->q_core.d_icount);
692 timer = be32_to_cpu(dqp->q_core.d_itimer); 691 timer = be32_to_cpu(dqp->q_core.d_itimer);
693 warns = be16_to_cpu(dqp->q_core.d_iwarns); 692 warns = be16_to_cpu(dqp->q_core.d_iwarns);
694 warnlimit = XFS_QI_IWARNLIMIT(dqp->q_mount); 693 warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit;
695 hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); 694 hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
696 if (!hardlimit) 695 if (!hardlimit)
697 hardlimit = q->qi_ihardlimit; 696 hardlimit = q->qi_ihardlimit;
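
The xfs_trans_get_dqtrx() rewrite above hoists the user-versus-group accounting array selection out of the slot scan, since it does not depend on i, replacing the XFS_QM_DQP_TO_DQACCT macro that the xfs_quota_priv.h hunk above deletes. A hedged sketch of the caller side, which is not part of this hunk (note_dquot_mod is a made-up name, and qt_bcount_delta is assumed from the xfs_dqtrx_t definition outside this diff):

static void note_dquot_mod(xfs_trans_t *tp, xfs_dquot_t *dqp, long delta)
{
        xfs_dqtrx_t *qtrx = xfs_trans_get_dqtrx(tp, dqp);

        if (qtrx == NULL)
                return;                 /* all XFS_QM_TRANS_MAXDQS slots taken */
        if (qtrx->qt_dquot == NULL)
                qtrx->qt_dquot = dqp;   /* first touch: claim the empty slot */
        qtrx->qt_bcount_delta += delta;
}
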
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index b1a5a1ff88ea..abb8222b88c9 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -223,6 +223,7 @@ typedef struct xfs_perag {
223 int pag_ici_init; /* incore inode cache initialised */ 223 int pag_ici_init; /* incore inode cache initialised */
224 rwlock_t pag_ici_lock; /* incore inode lock */ 224 rwlock_t pag_ici_lock; /* incore inode lock */
225 struct radix_tree_root pag_ici_root; /* incore inode cache root */ 225 struct radix_tree_root pag_ici_root; /* incore inode cache root */
226 int pag_ici_reclaimable; /* reclaimable inodes */
226#endif 227#endif
227 int pagb_count; /* pagb slots in use */ 228 int pagb_count; /* pagb slots in use */
228 xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */ 229 xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 5c11e4d17010..99587ded043f 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -3829,7 +3829,7 @@ xfs_bmap_add_attrfork(
3829 } 3829 }
3830 if ((error = xfs_bmap_finish(&tp, &flist, &committed))) 3830 if ((error = xfs_bmap_finish(&tp, &flist, &committed)))
3831 goto error2; 3831 goto error2;
3832 error = xfs_trans_commit(tp, XFS_TRANS_PERM_LOG_RES); 3832 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3833 ASSERT(ip->i_df.if_ext_max == 3833 ASSERT(ip->i_df.if_ext_max ==
3834 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); 3834 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
3835 return error; 3835 return error;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index f3c49e69eab9..240340a4727b 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -372,12 +372,12 @@ xfs_buf_item_pin(
372 */ 372 */
373STATIC void 373STATIC void
374xfs_buf_item_unpin( 374xfs_buf_item_unpin(
375 xfs_buf_log_item_t *bip, 375 xfs_buf_log_item_t *bip)
376 int stale)
377{ 376{
378 struct xfs_ail *ailp; 377 struct xfs_ail *ailp;
379 xfs_buf_t *bp; 378 xfs_buf_t *bp;
380 int freed; 379 int freed;
380 int stale = bip->bli_flags & XFS_BLI_STALE;
381 381
382 bp = bip->bli_buf; 382 bp = bip->bli_buf;
383 ASSERT(bp != NULL); 383 ASSERT(bp != NULL);
@@ -428,40 +428,34 @@ xfs_buf_item_unpin_remove(
428 xfs_buf_log_item_t *bip, 428 xfs_buf_log_item_t *bip,
429 xfs_trans_t *tp) 429 xfs_trans_t *tp)
430{ 430{
431 xfs_buf_t *bp; 431 /* will xfs_buf_item_unpin() call xfs_buf_item_relse()? */
432 xfs_log_item_desc_t *lidp;
433 int stale = 0;
434
435 bp = bip->bli_buf;
436 /*
437 * will xfs_buf_item_unpin() call xfs_buf_item_relse()?
438 */
439 if ((atomic_read(&bip->bli_refcount) == 1) && 432 if ((atomic_read(&bip->bli_refcount) == 1) &&
440 (bip->bli_flags & XFS_BLI_STALE)) { 433 (bip->bli_flags & XFS_BLI_STALE)) {
434 /*
435 * yes -- We can safely do some work here and then call
 436 * buf_item_unpin to do the rest because we are
 437 * holding the buffer locked so no one else will be
438 * able to bump up the refcount. We have to remove the
439 * log item from the transaction as we are about to release
440 * our reference to the buffer. If we don't, the unlock that
 441 * occurs later in xfs_trans_uncommit() will try to
442 * reference the buffer which we no longer have a hold on.
443 */
444 struct xfs_log_item_desc *lidp;
445
441 ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0); 446 ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0);
442 trace_xfs_buf_item_unpin_stale(bip); 447 trace_xfs_buf_item_unpin_stale(bip);
443 448
444 /* 449 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)bip);
445 * yes -- clear the xaction descriptor in-use flag
446 * and free the chunk if required. We can safely
447 * do some work here and then call buf_item_unpin
448 * to do the rest because if the if is true, then
449 * we are holding the buffer locked so no one else
450 * will be able to bump up the refcount.
451 */
452 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) bip);
453 stale = lidp->lid_flags & XFS_LID_BUF_STALE;
454 xfs_trans_free_item(tp, lidp); 450 xfs_trans_free_item(tp, lidp);
451
455 /* 452 /*
456 * Since the transaction no longer refers to the buffer, 453 * Since the transaction no longer refers to the buffer, the
457 * the buffer should no longer refer to the transaction. 454 * buffer should no longer refer to the transaction.
458 */ 455 */
459 XFS_BUF_SET_FSPRIVATE2(bp, NULL); 456 XFS_BUF_SET_FSPRIVATE2(bip->bli_buf, NULL);
460 } 457 }
461 458 xfs_buf_item_unpin(bip);
462 xfs_buf_item_unpin(bip, stale);
463
464 return;
465} 459}
466 460
467/* 461/*
@@ -675,7 +669,7 @@ static struct xfs_item_ops xfs_buf_item_ops = {
675 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 669 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
676 xfs_buf_item_format, 670 xfs_buf_item_format,
677 .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin, 671 .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin,
678 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_buf_item_unpin, 672 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_buf_item_unpin,
679 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) 673 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
680 xfs_buf_item_unpin_remove, 674 xfs_buf_item_unpin_remove,
681 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock, 675 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock,
@@ -733,10 +727,7 @@ xfs_buf_item_init(
733 727
734 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, 728 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
735 KM_SLEEP); 729 KM_SLEEP);
736 bip->bli_item.li_type = XFS_LI_BUF; 730 xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
737 bip->bli_item.li_ops = &xfs_buf_item_ops;
738 bip->bli_item.li_mountp = mp;
739 bip->bli_item.li_ailp = mp->m_ail;
740 bip->bli_buf = bp; 731 bip->bli_buf = bp;
741 xfs_buf_hold(bp); 732 xfs_buf_hold(bp);
742 bip->bli_format.blf_type = XFS_LI_BUF; 733 bip->bli_format.blf_type = XFS_LI_BUF;
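
Each of the open-coded li_* initialisation blocks this patch removes (here, and in xfs_extfree_item.c and xfs_inode_item.c below) is folded into the new xfs_log_item_init() helper. Its body follows directly from the four lines deleted above; a sketch (the real definition lives outside this diff):

void
xfs_log_item_init(
        struct xfs_mount        *mp,
        struct xfs_log_item     *item,
        int                     type,
        struct xfs_item_ops     *ops)
{
        item->li_type = type;
        item->li_ops = ops;
        item->li_mountp = mp;
        item->li_ailp = mp->m_ail;
}
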
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 217f34af00cb..df4454511f73 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -26,7 +26,7 @@ extern kmem_zone_t *xfs_buf_item_zone;
26 * have been logged. 26 * have been logged.
27 * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything. 27 * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything.
28 */ 28 */
29typedef struct xfs_buf_log_format_t { 29typedef struct xfs_buf_log_format {
30 unsigned short blf_type; /* buf log item type indicator */ 30 unsigned short blf_type; /* buf log item type indicator */
31 unsigned short blf_size; /* size of this item */ 31 unsigned short blf_size; /* size of this item */
32 ushort blf_flags; /* misc state */ 32 ushort blf_flags; /* misc state */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index cd27c9d6c71f..5bba29a07812 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -177,16 +177,26 @@ xfs_swap_extents_check_format(
177 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max) 177 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max)
178 return EINVAL; 178 return EINVAL;
179 179
180 /* Check root block of temp in btree form to max in target */ 180 /*
181 * If we are in a btree format, check that the temp root block will fit
182 * in the target and that it has enough extents to be in btree format
183 * in the target.
184 *
185 * Note that we have to be careful to allow btree->extent conversions
186 * (a common defrag case) which will occur when the temp inode is in
187 * extent format...
188 */
181 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE && 189 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
182 XFS_IFORK_BOFF(ip) && 190 ((XFS_IFORK_BOFF(ip) &&
183 tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) 191 tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) ||
192 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max))
184 return EINVAL; 193 return EINVAL;
185 194
186 /* Check root block of target in btree form to max in temp */ 195 /* Reciprocal target->temp btree format checks */
187 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && 196 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
188 XFS_IFORK_BOFF(tip) && 197 ((XFS_IFORK_BOFF(tip) &&
189 ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) 198 ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) ||
199 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max))
190 return EINVAL; 200 return EINVAL;
191 201
192 return 0; 202 return 0;
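
The strengthened xfs_swap_extents_check_format() test reads more easily as a predicate; restating one direction of the new check (swap_rejected_tmp_btree is an illustrative name, the logic is copied from the hunk above):

static bool swap_rejected_tmp_btree(struct xfs_inode *ip, struct xfs_inode *tip)
{
        if (tip->i_d.di_format != XFS_DINODE_FMT_BTREE)
                return false;
        /* temp btree root would not fit in the target's inline fork area */
        if (XFS_IFORK_BOFF(ip) &&
            tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
                return true;
        /* too few extents to stay in btree format once in the target */
        return XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max;
}

The reciprocal target-to-temp check swaps ip and tip, exactly as the second hunk does.
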
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 92d5cd5bf4f2..ef96175c0744 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -186,18 +186,18 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
186 186
187void 187void
188xfs_error_report( 188xfs_error_report(
189 char *tag, 189 const char *tag,
190 int level, 190 int level,
191 xfs_mount_t *mp, 191 struct xfs_mount *mp,
192 char *fname, 192 const char *filename,
193 int linenum, 193 int linenum,
194 inst_t *ra) 194 inst_t *ra)
195{ 195{
196 if (level <= xfs_error_level) { 196 if (level <= xfs_error_level) {
197 xfs_cmn_err(XFS_PTAG_ERROR_REPORT, 197 xfs_cmn_err(XFS_PTAG_ERROR_REPORT,
198 CE_ALERT, mp, 198 CE_ALERT, mp,
199 "XFS internal error %s at line %d of file %s. Caller 0x%p\n", 199 "XFS internal error %s at line %d of file %s. Caller 0x%p\n",
200 tag, linenum, fname, ra); 200 tag, linenum, filename, ra);
201 201
202 xfs_stack_trace(); 202 xfs_stack_trace();
203 } 203 }
@@ -205,15 +205,15 @@ xfs_error_report(
205 205
206void 206void
207xfs_corruption_error( 207xfs_corruption_error(
208 char *tag, 208 const char *tag,
209 int level, 209 int level,
210 xfs_mount_t *mp, 210 struct xfs_mount *mp,
211 void *p, 211 void *p,
212 char *fname, 212 const char *filename,
213 int linenum, 213 int linenum,
214 inst_t *ra) 214 inst_t *ra)
215{ 215{
216 if (level <= xfs_error_level) 216 if (level <= xfs_error_level)
217 xfs_hex_dump(p, 16); 217 xfs_hex_dump(p, 16);
218 xfs_error_report(tag, level, mp, fname, linenum, ra); 218 xfs_error_report(tag, level, mp, filename, linenum, ra);
219} 219}
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 0c93051c4651..c2c1a072bb82 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -29,10 +29,11 @@ extern int xfs_error_trap(int);
29 29
30struct xfs_mount; 30struct xfs_mount;
31 31
32extern void xfs_error_report(char *tag, int level, struct xfs_mount *mp, 32extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
33 char *fname, int linenum, inst_t *ra); 33 const char *filename, int linenum, inst_t *ra);
34extern void xfs_corruption_error(char *tag, int level, struct xfs_mount *mp, 34extern void xfs_corruption_error(const char *tag, int level,
35 void *p, char *fname, int linenum, inst_t *ra); 35 struct xfs_mount *mp, void *p, const char *filename,
36 int linenum, inst_t *ra);
36 37
37#define XFS_ERROR_REPORT(e, lvl, mp) \ 38#define XFS_ERROR_REPORT(e, lvl, mp) \
38 xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) 39 xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address)
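
The tag and filename parameters become const char * because every caller hands in string literals: the XFS_ERROR_REPORT macro above passes __FILE__, and the tags are literals at the call sites. With the const-qualified prototypes a call such as the following (illustrative tag string; XFS_ERRLEVEL_LOW is assumed from xfs_error.h, outside this hunk) type-checks without warnings, and the callee can no longer write through the pointers:

xfs_error_report("example_tag", XFS_ERRLEVEL_LOW, mp,
                 __FILE__, __LINE__, __return_address);
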
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 6f35ed1b39b9..409fe81585fd 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -106,7 +106,7 @@ xfs_efi_item_pin(xfs_efi_log_item_t *efip)
106 */ 106 */
107/*ARGSUSED*/ 107/*ARGSUSED*/
108STATIC void 108STATIC void
109xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale) 109xfs_efi_item_unpin(xfs_efi_log_item_t *efip)
110{ 110{
111 struct xfs_ail *ailp = efip->efi_item.li_ailp; 111 struct xfs_ail *ailp = efip->efi_item.li_ailp;
112 112
@@ -224,7 +224,7 @@ static struct xfs_item_ops xfs_efi_item_ops = {
224 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 224 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
225 xfs_efi_item_format, 225 xfs_efi_item_format,
226 .iop_pin = (void(*)(xfs_log_item_t*))xfs_efi_item_pin, 226 .iop_pin = (void(*)(xfs_log_item_t*))xfs_efi_item_pin,
227 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efi_item_unpin, 227 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_efi_item_unpin,
228 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) 228 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
229 xfs_efi_item_unpin_remove, 229 xfs_efi_item_unpin_remove,
230 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock, 230 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock,
@@ -259,10 +259,7 @@ xfs_efi_init(xfs_mount_t *mp,
259 KM_SLEEP); 259 KM_SLEEP);
260 } 260 }
261 261
262 efip->efi_item.li_type = XFS_LI_EFI; 262 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
263 efip->efi_item.li_ops = &xfs_efi_item_ops;
264 efip->efi_item.li_mountp = mp;
265 efip->efi_item.li_ailp = mp->m_ail;
266 efip->efi_format.efi_nextents = nextents; 263 efip->efi_format.efi_nextents = nextents;
267 efip->efi_format.efi_id = (__psint_t)(void*)efip; 264 efip->efi_format.efi_id = (__psint_t)(void*)efip;
268 265
@@ -428,7 +425,7 @@ xfs_efd_item_pin(xfs_efd_log_item_t *efdp)
428 */ 425 */
429/*ARGSUSED*/ 426/*ARGSUSED*/
430STATIC void 427STATIC void
431xfs_efd_item_unpin(xfs_efd_log_item_t *efdp, int stale) 428xfs_efd_item_unpin(xfs_efd_log_item_t *efdp)
432{ 429{
433 return; 430 return;
434} 431}
@@ -518,7 +515,7 @@ static struct xfs_item_ops xfs_efd_item_ops = {
518 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 515 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
519 xfs_efd_item_format, 516 xfs_efd_item_format,
520 .iop_pin = (void(*)(xfs_log_item_t*))xfs_efd_item_pin, 517 .iop_pin = (void(*)(xfs_log_item_t*))xfs_efd_item_pin,
521 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efd_item_unpin, 518 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_efd_item_unpin,
522 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) 519 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
523 xfs_efd_item_unpin_remove, 520 xfs_efd_item_unpin_remove,
524 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock, 521 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock,
@@ -554,10 +551,7 @@ xfs_efd_init(xfs_mount_t *mp,
554 KM_SLEEP); 551 KM_SLEEP);
555 } 552 }
556 553
557 efdp->efd_item.li_type = XFS_LI_EFD; 554 xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops);
558 efdp->efd_item.li_ops = &xfs_efd_item_ops;
559 efdp->efd_item.li_mountp = mp;
560 efdp->efd_item.li_ailp = mp->m_ail;
561 efdp->efd_efip = efip; 555 efdp->efd_efip = efip;
562 efdp->efd_format.efd_nextents = nextents; 556 efdp->efd_format.efd_nextents = nextents;
563 efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; 557 efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 0ffd56447045..8cd6e8d8fe9c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2449,6 +2449,8 @@ xfs_iunpin_nowait(
2449{ 2449{
2450 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2450 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2451 2451
2452 trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
2453
2452 /* Give the log a push to start the unpinning I/O */ 2454 /* Give the log a push to start the unpinning I/O */
2453 xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0); 2455 xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
2454 2456
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 7bfea8540159..cf8249a60004 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -543,6 +543,7 @@ xfs_inode_item_pin(
543{ 543{
544 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); 544 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL));
545 545
546 trace_xfs_inode_pin(iip->ili_inode, _RET_IP_);
546 atomic_inc(&iip->ili_inode->i_pincount); 547 atomic_inc(&iip->ili_inode->i_pincount);
547} 548}
548 549
@@ -556,11 +557,11 @@ xfs_inode_item_pin(
556/* ARGSUSED */ 557/* ARGSUSED */
557STATIC void 558STATIC void
558xfs_inode_item_unpin( 559xfs_inode_item_unpin(
559 xfs_inode_log_item_t *iip, 560 xfs_inode_log_item_t *iip)
560 int stale)
561{ 561{
562 struct xfs_inode *ip = iip->ili_inode; 562 struct xfs_inode *ip = iip->ili_inode;
563 563
564 trace_xfs_inode_unpin(ip, _RET_IP_);
564 ASSERT(atomic_read(&ip->i_pincount) > 0); 565 ASSERT(atomic_read(&ip->i_pincount) > 0);
565 if (atomic_dec_and_test(&ip->i_pincount)) 566 if (atomic_dec_and_test(&ip->i_pincount))
566 wake_up(&ip->i_ipin_wait); 567 wake_up(&ip->i_ipin_wait);
@@ -572,7 +573,7 @@ xfs_inode_item_unpin_remove(
572 xfs_inode_log_item_t *iip, 573 xfs_inode_log_item_t *iip,
573 xfs_trans_t *tp) 574 xfs_trans_t *tp)
574{ 575{
575 xfs_inode_item_unpin(iip, 0); 576 xfs_inode_item_unpin(iip);
576} 577}
577 578
578/* 579/*
@@ -838,7 +839,7 @@ static struct xfs_item_ops xfs_inode_item_ops = {
838 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 839 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
839 xfs_inode_item_format, 840 xfs_inode_item_format,
840 .iop_pin = (void(*)(xfs_log_item_t*))xfs_inode_item_pin, 841 .iop_pin = (void(*)(xfs_log_item_t*))xfs_inode_item_pin,
841 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_inode_item_unpin, 842 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_inode_item_unpin,
842 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) 843 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
843 xfs_inode_item_unpin_remove, 844 xfs_inode_item_unpin_remove,
844 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock, 845 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock,
@@ -865,17 +866,9 @@ xfs_inode_item_init(
865 ASSERT(ip->i_itemp == NULL); 866 ASSERT(ip->i_itemp == NULL);
866 iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); 867 iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP);
867 868
868 iip->ili_item.li_type = XFS_LI_INODE;
869 iip->ili_item.li_ops = &xfs_inode_item_ops;
870 iip->ili_item.li_mountp = mp;
871 iip->ili_item.li_ailp = mp->m_ail;
872 iip->ili_inode = ip; 869 iip->ili_inode = ip;
873 870 xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
874 /* 871 &xfs_inode_item_ops);
875 We have zeroed memory. No need ...
876 iip->ili_extents_buf = NULL;
877 */
878
879 iip->ili_format.ilf_type = XFS_LI_INODE; 872 iip->ili_format.ilf_type = XFS_LI_INODE;
880 iip->ili_format.ilf_ino = ip->i_ino; 873 iip->ili_format.ilf_ino = ip->i_ino;
881 iip->ili_format.ilf_blkno = ip->i_imap.im_blkno; 874 iip->ili_format.ilf_blkno = ip->i_imap.im_blkno;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 0b65039951a0..ef14943829da 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -55,71 +55,33 @@
55#define XFS_STRAT_WRITE_IMAPS 2 55#define XFS_STRAT_WRITE_IMAPS 2
56#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP 56#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
57 57
58STATIC int 58STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
59xfs_imap_to_bmap( 59 int, struct xfs_bmbt_irec *, int *);
60 xfs_inode_t *ip, 60STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
61 xfs_off_t offset, 61 struct xfs_bmbt_irec *, int *);
62 xfs_bmbt_irec_t *imap, 62STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
63 xfs_iomap_t *iomapp, 63 struct xfs_bmbt_irec *, int *);
64 int imaps, /* Number of imap entries */
65 int iomaps, /* Number of iomap entries */
66 int flags)
67{
68 xfs_mount_t *mp = ip->i_mount;
69 int pbm;
70 xfs_fsblock_t start_block;
71
72
73 for (pbm = 0; imaps && pbm < iomaps; imaps--, iomapp++, imap++, pbm++) {
74 iomapp->iomap_offset = XFS_FSB_TO_B(mp, imap->br_startoff);
75 iomapp->iomap_delta = offset - iomapp->iomap_offset;
76 iomapp->iomap_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount);
77 iomapp->iomap_flags = flags;
78
79 if (XFS_IS_REALTIME_INODE(ip)) {
80 iomapp->iomap_flags |= IOMAP_REALTIME;
81 iomapp->iomap_target = mp->m_rtdev_targp;
82 } else {
83 iomapp->iomap_target = mp->m_ddev_targp;
84 }
85 start_block = imap->br_startblock;
86 if (start_block == HOLESTARTBLOCK) {
87 iomapp->iomap_bn = IOMAP_DADDR_NULL;
88 iomapp->iomap_flags |= IOMAP_HOLE;
89 } else if (start_block == DELAYSTARTBLOCK) {
90 iomapp->iomap_bn = IOMAP_DADDR_NULL;
91 iomapp->iomap_flags |= IOMAP_DELAY;
92 } else {
93 iomapp->iomap_bn = xfs_fsb_to_db(ip, start_block);
94 if (ISUNWRITTEN(imap))
95 iomapp->iomap_flags |= IOMAP_UNWRITTEN;
96 }
97
98 offset += iomapp->iomap_bsize - iomapp->iomap_delta;
99 }
100 return pbm; /* Return the number filled */
101}
102 64
103int 65int
104xfs_iomap( 66xfs_iomap(
105 xfs_inode_t *ip, 67 struct xfs_inode *ip,
106 xfs_off_t offset, 68 xfs_off_t offset,
107 ssize_t count, 69 ssize_t count,
108 int flags, 70 int flags,
109 xfs_iomap_t *iomapp, 71 struct xfs_bmbt_irec *imap,
110 int *niomaps) 72 int *nimaps,
73 int *new)
111{ 74{
112 xfs_mount_t *mp = ip->i_mount; 75 struct xfs_mount *mp = ip->i_mount;
113 xfs_fileoff_t offset_fsb, end_fsb; 76 xfs_fileoff_t offset_fsb, end_fsb;
114 int error = 0; 77 int error = 0;
115 int lockmode = 0; 78 int lockmode = 0;
116 xfs_bmbt_irec_t imap; 79 int bmapi_flags = 0;
117 int nimaps = 1;
118 int bmapi_flags = 0;
119 int iomap_flags = 0;
120 80
121 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); 81 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
122 82
83 *new = 0;
84
123 if (XFS_FORCED_SHUTDOWN(mp)) 85 if (XFS_FORCED_SHUTDOWN(mp))
124 return XFS_ERROR(EIO); 86 return XFS_ERROR(EIO);
125 87
@@ -160,8 +122,8 @@ xfs_iomap(
160 122
161 error = xfs_bmapi(NULL, ip, offset_fsb, 123 error = xfs_bmapi(NULL, ip, offset_fsb,
162 (xfs_filblks_t)(end_fsb - offset_fsb), 124 (xfs_filblks_t)(end_fsb - offset_fsb),
163 bmapi_flags, NULL, 0, &imap, 125 bmapi_flags, NULL, 0, imap,
164 &nimaps, NULL, NULL); 126 nimaps, NULL, NULL);
165 127
166 if (error) 128 if (error)
167 goto out; 129 goto out;
@@ -169,46 +131,41 @@ xfs_iomap(
169 switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) { 131 switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) {
170 case BMAPI_WRITE: 132 case BMAPI_WRITE:
171 /* If we found an extent, return it */ 133 /* If we found an extent, return it */
172 if (nimaps && 134 if (*nimaps &&
173 (imap.br_startblock != HOLESTARTBLOCK) && 135 (imap->br_startblock != HOLESTARTBLOCK) &&
174 (imap.br_startblock != DELAYSTARTBLOCK)) { 136 (imap->br_startblock != DELAYSTARTBLOCK)) {
175 trace_xfs_iomap_found(ip, offset, count, flags, &imap); 137 trace_xfs_iomap_found(ip, offset, count, flags, imap);
176 break; 138 break;
177 } 139 }
178 140
179 if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) { 141 if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) {
180 error = xfs_iomap_write_direct(ip, offset, count, flags, 142 error = xfs_iomap_write_direct(ip, offset, count, flags,
181 &imap, &nimaps, nimaps); 143 imap, nimaps);
182 } else { 144 } else {
183 error = xfs_iomap_write_delay(ip, offset, count, flags, 145 error = xfs_iomap_write_delay(ip, offset, count, flags,
184 &imap, &nimaps); 146 imap, nimaps);
185 } 147 }
186 if (!error) { 148 if (!error) {
187 trace_xfs_iomap_alloc(ip, offset, count, flags, &imap); 149 trace_xfs_iomap_alloc(ip, offset, count, flags, imap);
188 } 150 }
189 iomap_flags = IOMAP_NEW; 151 *new = 1;
190 break; 152 break;
191 case BMAPI_ALLOCATE: 153 case BMAPI_ALLOCATE:
192 /* If we found an extent, return it */ 154 /* If we found an extent, return it */
193 xfs_iunlock(ip, lockmode); 155 xfs_iunlock(ip, lockmode);
194 lockmode = 0; 156 lockmode = 0;
195 157
196 if (nimaps && !isnullstartblock(imap.br_startblock)) { 158 if (*nimaps && !isnullstartblock(imap->br_startblock)) {
197 trace_xfs_iomap_found(ip, offset, count, flags, &imap); 159 trace_xfs_iomap_found(ip, offset, count, flags, imap);
198 break; 160 break;
199 } 161 }
200 162
201 error = xfs_iomap_write_allocate(ip, offset, count, 163 error = xfs_iomap_write_allocate(ip, offset, count,
202 &imap, &nimaps); 164 imap, nimaps);
203 break; 165 break;
204 } 166 }
205 167
206 if (nimaps) { 168 ASSERT(*nimaps <= 1);
207 *niomaps = xfs_imap_to_bmap(ip, offset, &imap,
208 iomapp, nimaps, *niomaps, iomap_flags);
209 } else if (niomaps) {
210 *niomaps = 0;
211 }
212 169
213out: 170out:
214 if (lockmode) 171 if (lockmode)
@@ -216,7 +173,6 @@ out:
216 return XFS_ERROR(error); 173 return XFS_ERROR(error);
217} 174}
218 175
219
220STATIC int 176STATIC int
221xfs_iomap_eof_align_last_fsb( 177xfs_iomap_eof_align_last_fsb(
222 xfs_mount_t *mp, 178 xfs_mount_t *mp,
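With this hunk applied, xfs_iomap() hands back the raw extent record and an explicit "new" flag rather than a translated xfs_iomap_t. A minimal sketch of the updated calling convention, based on the prototype in xfs_iomap.h below (caller-side variable names are illustrative):

    struct xfs_bmbt_irec imap;
    int nimaps = 1;
    int new = 0;
    int error;

    error = xfs_iomap(ip, offset, count, BMAPI_READ, &imap, &nimaps, &new);
    if (!error && nimaps) {
            /* imap.br_startoff, br_startblock and br_blockcount describe
             * the mapping directly; 'new' replaces the old IOMAP_NEW flag
             * and is set when the BMAPI_WRITE path allocated the extent. */
    }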
@@ -285,15 +241,14 @@ xfs_cmn_err_fsblock_zero(
285 return EFSCORRUPTED; 241 return EFSCORRUPTED;
286} 242}
287 243
288int 244STATIC int
289xfs_iomap_write_direct( 245xfs_iomap_write_direct(
290 xfs_inode_t *ip, 246 xfs_inode_t *ip,
291 xfs_off_t offset, 247 xfs_off_t offset,
292 size_t count, 248 size_t count,
293 int flags, 249 int flags,
294 xfs_bmbt_irec_t *ret_imap, 250 xfs_bmbt_irec_t *ret_imap,
295 int *nmaps, 251 int *nmaps)
296 int found)
297{ 252{
298 xfs_mount_t *mp = ip->i_mount; 253 xfs_mount_t *mp = ip->i_mount;
299 xfs_fileoff_t offset_fsb; 254 xfs_fileoff_t offset_fsb;
@@ -330,7 +285,7 @@ xfs_iomap_write_direct(
330 if (error) 285 if (error)
331 goto error_out; 286 goto error_out;
332 } else { 287 } else {
333 if (found && (ret_imap->br_startblock == HOLESTARTBLOCK)) 288 if (*nmaps && (ret_imap->br_startblock == HOLESTARTBLOCK))
334 last_fsb = MIN(last_fsb, (xfs_fileoff_t) 289 last_fsb = MIN(last_fsb, (xfs_fileoff_t)
335 ret_imap->br_blockcount + 290 ret_imap->br_blockcount +
336 ret_imap->br_startoff); 291 ret_imap->br_startoff);
@@ -485,7 +440,7 @@ xfs_iomap_eof_want_preallocate(
485 return 0; 440 return 0;
486} 441}
487 442
488int 443STATIC int
489xfs_iomap_write_delay( 444xfs_iomap_write_delay(
490 xfs_inode_t *ip, 445 xfs_inode_t *ip,
491 xfs_off_t offset, 446 xfs_off_t offset,
@@ -588,7 +543,7 @@ retry:
588 * We no longer bother to look at the incoming map - all we have to 543 * We no longer bother to look at the incoming map - all we have to
589 * guarantee is that whatever we allocate fills the required range. 544 * guarantee is that whatever we allocate fills the required range.
590 */ 545 */
591int 546STATIC int
592xfs_iomap_write_allocate( 547xfs_iomap_write_allocate(
593 xfs_inode_t *ip, 548 xfs_inode_t *ip,
594 xfs_off_t offset, 549 xfs_off_t offset,
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 174f29990991..81ac4afd45b3 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,19 +18,6 @@
18#ifndef __XFS_IOMAP_H__ 18#ifndef __XFS_IOMAP_H__
19#define __XFS_IOMAP_H__ 19#define __XFS_IOMAP_H__
20 20
21#define IOMAP_DADDR_NULL ((xfs_daddr_t) (-1LL))
22
23
24typedef enum { /* iomap_flags values */
25 IOMAP_READ = 0, /* mapping for a read */
26 IOMAP_HOLE = 0x02, /* mapping covers a hole */
27 IOMAP_DELAY = 0x04, /* mapping covers delalloc region */
28 IOMAP_REALTIME = 0x10, /* mapping on the realtime device */
29 IOMAP_UNWRITTEN = 0x20, /* mapping covers allocated */
30 /* but uninitialized file data */
31 IOMAP_NEW = 0x40 /* just allocate */
32} iomap_flags_t;
33
34typedef enum { 21typedef enum {
35 /* base extent manipulation calls */ 22 /* base extent manipulation calls */
36 BMAPI_READ = (1 << 0), /* read extents */ 23 BMAPI_READ = (1 << 0), /* read extents */
@@ -52,43 +39,11 @@ typedef enum {
52 { BMAPI_MMAP, "MMAP" }, \ 39 { BMAPI_MMAP, "MMAP" }, \
53 { BMAPI_TRYLOCK, "TRYLOCK" } 40 { BMAPI_TRYLOCK, "TRYLOCK" }
54 41
55/*
56 * xfs_iomap_t: File system I/O map
57 *
58 * The iomap_bn field is expressed in 512-byte blocks, and is where the
59 * mapping starts on disk.
60 *
61 * The iomap_offset, iomap_bsize and iomap_delta fields are in bytes.
62 * iomap_offset is the offset of the mapping in the file itself.
63 * iomap_bsize is the size of the mapping, iomap_delta is the
64 * desired data's offset into the mapping, given the offset supplied
65 * to the file I/O map routine.
66 *
67 * When a request is made to read beyond the logical end of the object,
68 * iomap_size may be set to 0, but iomap_offset and iomap_length should be set
69 * to the actual amount of underlying storage that has been allocated, if any.
70 */
71
72typedef struct xfs_iomap {
73 xfs_daddr_t iomap_bn; /* first 512B blk of mapping */
74 xfs_buftarg_t *iomap_target;
75 xfs_off_t iomap_offset; /* offset of mapping, bytes */
76 xfs_off_t iomap_bsize; /* size of mapping, bytes */
77 xfs_off_t iomap_delta; /* offset into mapping, bytes */
78 iomap_flags_t iomap_flags;
79} xfs_iomap_t;
80
81struct xfs_inode; 42struct xfs_inode;
82struct xfs_bmbt_irec; 43struct xfs_bmbt_irec;
83 44
84extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int, 45extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int,
85 struct xfs_iomap *, int *); 46 struct xfs_bmbt_irec *, int *, int *);
86extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
87 int, struct xfs_bmbt_irec *, int *, int);
88extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
89 struct xfs_bmbt_irec *, int *);
90extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
91 struct xfs_bmbt_irec *, int *);
92extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); 47extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
93 48
94#endif /* __XFS_IOMAP_H__*/ 49#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index e8fba92d7cd9..3038dd52c72a 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -44,13 +44,8 @@
44 44
45kmem_zone_t *xfs_log_ticket_zone; 45kmem_zone_t *xfs_log_ticket_zone;
46 46
47#define xlog_write_adv_cnt(ptr, len, off, bytes) \
48 { (ptr) += (bytes); \
49 (len) -= (bytes); \
50 (off) += (bytes);}
51
52/* Local miscellaneous function prototypes */ 47/* Local miscellaneous function prototypes */
53STATIC int xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket, 48STATIC int xlog_commit_record(struct log *log, struct xlog_ticket *ticket,
54 xlog_in_core_t **, xfs_lsn_t *); 49 xlog_in_core_t **, xfs_lsn_t *);
55STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, 50STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
56 xfs_buftarg_t *log_target, 51 xfs_buftarg_t *log_target,
@@ -59,11 +54,9 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
59STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); 54STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes);
60STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); 55STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
61STATIC void xlog_dealloc_log(xlog_t *log); 56STATIC void xlog_dealloc_log(xlog_t *log);
62STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[], 57STATIC int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
63 int nentries, struct xlog_ticket *tic, 58 struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
64 xfs_lsn_t *start_lsn, 59 xlog_in_core_t **commit_iclog, uint flags);
65 xlog_in_core_t **commit_iclog,
66 uint flags);
67 60
68/* local state machine functions */ 61/* local state machine functions */
69STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); 62STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
@@ -102,7 +95,7 @@ STATIC xlog_ticket_t *xlog_ticket_alloc(xlog_t *log,
102 uint flags); 95 uint flags);
103 96
104#if defined(DEBUG) 97#if defined(DEBUG)
105STATIC void xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr); 98STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr);
106STATIC void xlog_verify_grant_head(xlog_t *log, int equals); 99STATIC void xlog_verify_grant_head(xlog_t *log, int equals);
107STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, 100STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
108 int count, boolean_t syncing); 101 int count, boolean_t syncing);
@@ -258,7 +251,7 @@ xfs_log_done(
258 * If we get an error, just continue and give back the log ticket. 251 * If we get an error, just continue and give back the log ticket.
259 */ 252 */
260 (((ticket->t_flags & XLOG_TIC_INITED) == 0) && 253 (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
261 (xlog_commit_record(mp, ticket, iclog, &lsn)))) { 254 (xlog_commit_record(log, ticket, iclog, &lsn)))) {
262 lsn = (xfs_lsn_t) -1; 255 lsn = (xfs_lsn_t) -1;
263 if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { 256 if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
264 flags |= XFS_LOG_REL_PERM_RESERV; 257 flags |= XFS_LOG_REL_PERM_RESERV;
@@ -516,18 +509,10 @@ xfs_log_unmount_write(xfs_mount_t *mp)
516#ifdef DEBUG 509#ifdef DEBUG
517 xlog_in_core_t *first_iclog; 510 xlog_in_core_t *first_iclog;
518#endif 511#endif
519 xfs_log_iovec_t reg[1];
520 xlog_ticket_t *tic = NULL; 512 xlog_ticket_t *tic = NULL;
521 xfs_lsn_t lsn; 513 xfs_lsn_t lsn;
522 int error; 514 int error;
523 515
524 /* the data section must be 32 bit size aligned */
525 struct {
526 __uint16_t magic;
527 __uint16_t pad1;
528 __uint32_t pad2; /* may as well make it 64 bits */
529 } magic = { XLOG_UNMOUNT_TYPE, 0, 0 };
530
531 /* 516 /*
532 * Don't write out unmount record on read-only mounts. 517 * Don't write out unmount record on read-only mounts.
533 * Or, if we are doing a forced umount (typically because of IO errors). 518 * Or, if we are doing a forced umount (typically because of IO errors).
@@ -549,16 +534,30 @@ xfs_log_unmount_write(xfs_mount_t *mp)
549 } while (iclog != first_iclog); 534 } while (iclog != first_iclog);
550#endif 535#endif
551 if (! (XLOG_FORCED_SHUTDOWN(log))) { 536 if (! (XLOG_FORCED_SHUTDOWN(log))) {
552 reg[0].i_addr = (void*)&magic;
553 reg[0].i_len = sizeof(magic);
554 reg[0].i_type = XLOG_REG_TYPE_UNMOUNT;
555
556 error = xfs_log_reserve(mp, 600, 1, &tic, 537 error = xfs_log_reserve(mp, 600, 1, &tic,
557 XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE); 538 XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE);
558 if (!error) { 539 if (!error) {
540 /* the data section must be 32 bit size aligned */
541 struct {
542 __uint16_t magic;
543 __uint16_t pad1;
544 __uint32_t pad2; /* may as well make it 64 bits */
545 } magic = {
546 .magic = XLOG_UNMOUNT_TYPE,
547 };
548 struct xfs_log_iovec reg = {
549 .i_addr = (void *)&magic,
550 .i_len = sizeof(magic),
551 .i_type = XLOG_REG_TYPE_UNMOUNT,
552 };
553 struct xfs_log_vec vec = {
554 .lv_niovecs = 1,
555 .lv_iovecp = &reg,
556 };
557
559 /* remove inited flag */ 558 /* remove inited flag */
560 ((xlog_ticket_t *)tic)->t_flags = 0; 559 tic->t_flags = 0;
561 error = xlog_write(mp, reg, 1, tic, &lsn, 560 error = xlog_write(log, &vec, tic, &lsn,
562 NULL, XLOG_UNMOUNT_TRANS); 561 NULL, XLOG_UNMOUNT_TRANS);
563 /* 562 /*
564 * At this point, we're umounting anyway, 563 * At this point, we're umounting anyway,
@@ -648,10 +647,26 @@ xfs_log_unmount(xfs_mount_t *mp)
648 xlog_dealloc_log(mp->m_log); 647 xlog_dealloc_log(mp->m_log);
649} 648}
650 649
650void
651xfs_log_item_init(
652 struct xfs_mount *mp,
653 struct xfs_log_item *item,
654 int type,
655 struct xfs_item_ops *ops)
656{
657 item->li_mountp = mp;
658 item->li_ailp = mp->m_ail;
659 item->li_type = type;
660 item->li_ops = ops;
661}
662
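xfs_log_item_init() centralises the boilerplate that each log item type previously open-coded. A hedged usage sketch; the item type, ops table and wrapper below are hypothetical stand-ins for whatever a real item type defines:

    /* hypothetical item embedding the generic log item */
    struct foo_log_item {
            struct xfs_log_item fli_item;
            /* type-specific state ... */
    };

    static struct xfs_item_ops foo_item_ops = {
            /* callbacks elided */
    };

    static void
    foo_item_init(struct xfs_mount *mp, struct foo_log_item *flip)
    {
            xfs_log_item_init(mp, &flip->fli_item,
                              XFS_LI_FOO /* hypothetical type */, &foo_item_ops);
    }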
651/* 663/*
652 * Write region vectors to log. The write happens using the space reservation 664 * Write region vectors to log. The write happens using the space reservation
653 * of the ticket (tic). It is not a requirement that all writes for a given 665 * of the ticket (tic). It is not a requirement that all writes for a given
654 * transaction occur with one call to xfs_log_write(). 666 * transaction occur with one call to xfs_log_write(). However, it is important
667 * to note that the transaction reservation code makes an assumption about the
668 * number of log headers a transaction requires that may be violated if you
669 * don't pass all the transaction vectors in one call....
655 */ 670 */
656int 671int
657xfs_log_write( 672xfs_log_write(
@@ -663,11 +678,15 @@ xfs_log_write(
663{ 678{
664 struct log *log = mp->m_log; 679 struct log *log = mp->m_log;
665 int error; 680 int error;
681 struct xfs_log_vec vec = {
682 .lv_niovecs = nentries,
683 .lv_iovecp = reg,
684 };
666 685
667 if (XLOG_FORCED_SHUTDOWN(log)) 686 if (XLOG_FORCED_SHUTDOWN(log))
668 return XFS_ERROR(EIO); 687 return XFS_ERROR(EIO);
669 688
670 error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0); 689 error = xlog_write(log, &vec, tic, start_lsn, NULL, 0);
671 if (error) 690 if (error)
672 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 691 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
673 return error; 692 return error;
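xfs_log_write() now wraps its legacy iovec array in a single on-stack xfs_log_vec before calling xlog_write(). Internal callers that build several vectors can chain them through lv_next and submit the whole chain in one call, which is exactly what the reservation comment above requires. A sketch with placeholder region contents and types:

    struct xfs_log_iovec reg0 = { .i_addr = hdr,  .i_len = hdr_len,  .i_type = type0 };
    struct xfs_log_iovec reg1 = { .i_addr = body, .i_len = body_len, .i_type = type1 };
    struct xfs_log_vec lv1 = { .lv_niovecs = 1, .lv_iovecp = &reg1 };
    struct xfs_log_vec lv0 = {
            .lv_next    = &lv1,
            .lv_niovecs = 1,
            .lv_iovecp  = &reg0,
    };

    error = xlog_write(log, &lv0, ticket, &start_lsn, NULL, 0);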
@@ -745,9 +764,16 @@ xfs_log_move_tail(xfs_mount_t *mp,
745 764
746/* 765/*
747 * Determine if we have a transaction that has gone to disk 766 * Determine if we have a transaction that has gone to disk
748 * that needs to be covered. Log activity needs to be idle (no AIL and 767 * that needs to be covered. To begin the transition to the idle state
749 * nothing in the iclogs). And, we need to be in the right state indicating 768 * firstly the log needs to be idle (no AIL and nothing in the iclogs).
750 * something has gone out. 769 * If we are then in a state where covering is needed, the caller is informed
770 * that dummy transactions are required to move the log into the idle state.
771 *
772 * Because this is called as part of the sync process, we should also indicate
773 * that dummy transactions should be issued in anything but the covered or
774 * idle states. This ensures that the log tail is accurately reflected in
 775 * the log at the end of the sync, so that if a crash occurs we avoid replay
776 * of transactions where the metadata is already on disk.
751 */ 777 */
752int 778int
753xfs_log_need_covered(xfs_mount_t *mp) 779xfs_log_need_covered(xfs_mount_t *mp)
@@ -759,17 +785,24 @@ xfs_log_need_covered(xfs_mount_t *mp)
759 return 0; 785 return 0;
760 786
761 spin_lock(&log->l_icloglock); 787 spin_lock(&log->l_icloglock);
762 if (((log->l_covered_state == XLOG_STATE_COVER_NEED) || 788 switch (log->l_covered_state) {
763 (log->l_covered_state == XLOG_STATE_COVER_NEED2)) 789 case XLOG_STATE_COVER_DONE:
764 && !xfs_trans_ail_tail(log->l_ailp) 790 case XLOG_STATE_COVER_DONE2:
765 && xlog_iclogs_empty(log)) { 791 case XLOG_STATE_COVER_IDLE:
766 if (log->l_covered_state == XLOG_STATE_COVER_NEED) 792 break;
767 log->l_covered_state = XLOG_STATE_COVER_DONE; 793 case XLOG_STATE_COVER_NEED:
768 else { 794 case XLOG_STATE_COVER_NEED2:
769 ASSERT(log->l_covered_state == XLOG_STATE_COVER_NEED2); 795 if (!xfs_trans_ail_tail(log->l_ailp) &&
770 log->l_covered_state = XLOG_STATE_COVER_DONE2; 796 xlog_iclogs_empty(log)) {
797 if (log->l_covered_state == XLOG_STATE_COVER_NEED)
798 log->l_covered_state = XLOG_STATE_COVER_DONE;
799 else
800 log->l_covered_state = XLOG_STATE_COVER_DONE2;
771 } 801 }
802 /* FALLTHRU */
803 default:
772 needed = 1; 804 needed = 1;
805 break;
773 } 806 }
774 spin_unlock(&log->l_icloglock); 807 spin_unlock(&log->l_icloglock);
775 return needed; 808 return needed;
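The switch above encodes the covering state machine more explicitly than the old compound conditional; a summary table derived from the new code:

    /*
     * l_covered_state           AIL and iclogs empty?   transition         needed
     * COVER_DONE/DONE2/IDLE     not checked             none               0
     * COVER_NEED                yes                     -> COVER_DONE      1
     * COVER_NEED2               yes                     -> COVER_DONE2     1
     * COVER_NEED/NEED2          no                      none               1
     * any other state           not checked             none               1
     */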
@@ -1006,6 +1039,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1006 int i; 1039 int i;
1007 int iclogsize; 1040 int iclogsize;
1008 int error = ENOMEM; 1041 int error = ENOMEM;
1042 uint log2_size = 0;
1009 1043
1010 log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); 1044 log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL);
1011 if (!log) { 1045 if (!log) {
@@ -1031,29 +1065,30 @@ xlog_alloc_log(xfs_mount_t *mp,
1031 1065
1032 error = EFSCORRUPTED; 1066 error = EFSCORRUPTED;
1033 if (xfs_sb_version_hassector(&mp->m_sb)) { 1067 if (xfs_sb_version_hassector(&mp->m_sb)) {
1034 log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT; 1068 log2_size = mp->m_sb.sb_logsectlog;
1035 if (log->l_sectbb_log < 0 || 1069 if (log2_size < BBSHIFT) {
1036 log->l_sectbb_log > mp->m_sectbb_log) { 1070 xlog_warn("XFS: Log sector size too small "
1037 xlog_warn("XFS: Log sector size (0x%x) out of range.", 1071 "(0x%x < 0x%x)", log2_size, BBSHIFT);
1038 log->l_sectbb_log);
1039 goto out_free_log; 1072 goto out_free_log;
1040 } 1073 }
1041 1074
1042 /* for larger sector sizes, must have v2 or external log */ 1075 log2_size -= BBSHIFT;
1043 if (log->l_sectbb_log != 0 && 1076 if (log2_size > mp->m_sectbb_log) {
1044 (log->l_logBBstart != 0 && 1077 xlog_warn("XFS: Log sector size too large "
1045 !xfs_sb_version_haslogv2(&mp->m_sb))) { 1078 "(0x%x > 0x%x)", log2_size, mp->m_sectbb_log);
1046 xlog_warn("XFS: log sector size (0x%x) invalid "
1047 "for configuration.", log->l_sectbb_log);
1048 goto out_free_log; 1079 goto out_free_log;
1049 } 1080 }
1050 if (mp->m_sb.sb_logsectlog < BBSHIFT) { 1081
1051 xlog_warn("XFS: Log sector log (0x%x) too small.", 1082 /* for larger sector sizes, must have v2 or external log */
1052 mp->m_sb.sb_logsectlog); 1083 if (log2_size && log->l_logBBstart > 0 &&
1084 !xfs_sb_version_haslogv2(&mp->m_sb)) {
1085
1086 xlog_warn("XFS: log sector size (0x%x) invalid "
1087 "for configuration.", log2_size);
1053 goto out_free_log; 1088 goto out_free_log;
1054 } 1089 }
1055 } 1090 }
1056 log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1; 1091 log->l_sectBBsize = 1 << log2_size;
1057 1092
1058 xlog_get_iclog_buffer_size(mp, log); 1093 xlog_get_iclog_buffer_size(mp, log);
1059 1094
@@ -1160,26 +1195,31 @@ out:
1160 * ticket. Return the lsn of the commit record. 1195 * ticket. Return the lsn of the commit record.
1161 */ 1196 */
1162STATIC int 1197STATIC int
1163xlog_commit_record(xfs_mount_t *mp, 1198xlog_commit_record(
1164 xlog_ticket_t *ticket, 1199 struct log *log,
1165 xlog_in_core_t **iclog, 1200 struct xlog_ticket *ticket,
1166 xfs_lsn_t *commitlsnp) 1201 struct xlog_in_core **iclog,
1202 xfs_lsn_t *commitlsnp)
1167{ 1203{
1168 int error; 1204 struct xfs_mount *mp = log->l_mp;
1169 xfs_log_iovec_t reg[1]; 1205 int error;
1170 1206 struct xfs_log_iovec reg = {
1171 reg[0].i_addr = NULL; 1207 .i_addr = NULL,
1172 reg[0].i_len = 0; 1208 .i_len = 0,
1173 reg[0].i_type = XLOG_REG_TYPE_COMMIT; 1209 .i_type = XLOG_REG_TYPE_COMMIT,
1210 };
1211 struct xfs_log_vec vec = {
1212 .lv_niovecs = 1,
1213 .lv_iovecp = &reg,
1214 };
1174 1215
1175 ASSERT_ALWAYS(iclog); 1216 ASSERT_ALWAYS(iclog);
1176 if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp, 1217 error = xlog_write(log, &vec, ticket, commitlsnp, iclog,
1177 iclog, XLOG_COMMIT_TRANS))) { 1218 XLOG_COMMIT_TRANS);
1219 if (error)
1178 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 1220 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
1179 }
1180 return error; 1221 return error;
1181} /* xlog_commit_record */ 1222}
1182
1183 1223
1184/* 1224/*
1185 * Push on the buffer cache code if we ever use more than 75% of the on-disk 1225 * Push on the buffer cache code if we ever use more than 75% of the on-disk
@@ -1600,6 +1640,192 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
1600} 1640}
1601 1641
1602/* 1642/*
1643 * Calculate the potential space needed by the log vector. Each region gets
1644 * its own xlog_op_header_t and may need to be double word aligned.
1645 */
1646static int
1647xlog_write_calc_vec_length(
1648 struct xlog_ticket *ticket,
1649 struct xfs_log_vec *log_vector)
1650{
1651 struct xfs_log_vec *lv;
1652 int headers = 0;
1653 int len = 0;
1654 int i;
1655
1656 /* acct for start rec of xact */
1657 if (ticket->t_flags & XLOG_TIC_INITED)
1658 headers++;
1659
1660 for (lv = log_vector; lv; lv = lv->lv_next) {
1661 headers += lv->lv_niovecs;
1662
1663 for (i = 0; i < lv->lv_niovecs; i++) {
1664 struct xfs_log_iovec *vecp = &lv->lv_iovecp[i];
1665
1666 len += vecp->i_len;
1667 xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type);
1668 }
1669 }
1670
1671 ticket->t_res_num_ophdrs += headers;
1672 len += headers * sizeof(struct xlog_op_header);
1673
1674 return len;
1675}
1676
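A worked example of the calculation, with sizes assumed for illustration (xlog_op_header is taken as 12 bytes): a freshly reserved ticket (XLOG_TIC_INITED still set) writing one log vector with two regions of 28 and 100 bytes accounts for one start-record header plus one header per region:

    headers = 1 (start rec) + 2 (regions) = 3
    len     = 28 + 100 + 3 * 12 = 164 bytes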
1677/*
1678 * If first write for transaction, insert start record. We can't be trying to
1679 * commit if we are inited. We can't have any "partial_copy" if we are inited.
1680 */
1681static int
1682xlog_write_start_rec(
1683 struct xlog_op_header *ophdr,
1684 struct xlog_ticket *ticket)
1685{
1686 if (!(ticket->t_flags & XLOG_TIC_INITED))
1687 return 0;
1688
1689 ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
1690 ophdr->oh_clientid = ticket->t_clientid;
1691 ophdr->oh_len = 0;
1692 ophdr->oh_flags = XLOG_START_TRANS;
1693 ophdr->oh_res2 = 0;
1694
1695 ticket->t_flags &= ~XLOG_TIC_INITED;
1696
1697 return sizeof(struct xlog_op_header);
1698}
1699
1700static xlog_op_header_t *
1701xlog_write_setup_ophdr(
1702 struct log *log,
1703 struct xlog_op_header *ophdr,
1704 struct xlog_ticket *ticket,
1705 uint flags)
1706{
1707 ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
1708 ophdr->oh_clientid = ticket->t_clientid;
1709 ophdr->oh_res2 = 0;
1710
1711 /* are we copying a commit or unmount record? */
1712 ophdr->oh_flags = flags;
1713
1714 /*
1715 * We've seen logs corrupted with bad transaction client ids. This
1716 * makes sure that XFS doesn't generate them. Turn this into an EIO
1717 * and shut down the filesystem.
1718 */
1719 switch (ophdr->oh_clientid) {
1720 case XFS_TRANSACTION:
1721 case XFS_VOLUME:
1722 case XFS_LOG:
1723 break;
1724 default:
1725 xfs_fs_cmn_err(CE_WARN, log->l_mp,
1726 "Bad XFS transaction clientid 0x%x in ticket 0x%p",
1727 ophdr->oh_clientid, ticket);
1728 return NULL;
1729 }
1730
1731 return ophdr;
1732}
1733
1734/*
1735 * Set up the parameters of the region copy into the log. This has
1736 * to handle a region write split across multiple log buffers - this
1737 * state is kept external to this function so that this code can
1738 * be written in an obvious, self-documenting manner.
1739 */
1740static int
1741xlog_write_setup_copy(
1742 struct xlog_ticket *ticket,
1743 struct xlog_op_header *ophdr,
1744 int space_available,
1745 int space_required,
1746 int *copy_off,
1747 int *copy_len,
1748 int *last_was_partial_copy,
1749 int *bytes_consumed)
1750{
1751 int still_to_copy;
1752
1753 still_to_copy = space_required - *bytes_consumed;
1754 *copy_off = *bytes_consumed;
1755
1756 if (still_to_copy <= space_available) {
1757 /* write of region completes here */
1758 *copy_len = still_to_copy;
1759 ophdr->oh_len = cpu_to_be32(*copy_len);
1760 if (*last_was_partial_copy)
1761 ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
1762 *last_was_partial_copy = 0;
1763 *bytes_consumed = 0;
1764 return 0;
1765 }
1766
1767 /* partial write of region, needs extra log op header reservation */
1768 *copy_len = space_available;
1769 ophdr->oh_len = cpu_to_be32(*copy_len);
1770 ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
1771 if (*last_was_partial_copy)
1772 ophdr->oh_flags |= XLOG_WAS_CONT_TRANS;
1773 *bytes_consumed += *copy_len;
1774 (*last_was_partial_copy)++;
1775
1776 /* account for new log op header */
1777 ticket->t_curr_res -= sizeof(struct xlog_op_header);
1778 ticket->t_res_num_ophdrs++;
1779
1780 return sizeof(struct xlog_op_header);
1781}
1782
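To make the split-region state concrete, here is a trace with invented sizes: a 3000-byte region copied into iclogs that each have 2048 bytes of payload space available per call:

    call 1: still_to_copy = 3000, space_available = 2048
            -> copy_off = 0, copy_len = 2048, oh_flags |= XLOG_CONTINUE_TRANS,
               *last_was_partial_copy = 1, *bytes_consumed = 2048,
               returns sizeof(xlog_op_header) for the extra split-rec header
    call 2: still_to_copy = 952, space_available = 2048
            -> copy_off = 2048, copy_len = 952,
               oh_flags |= XLOG_END_TRANS | XLOG_WAS_CONT_TRANS,
               state reset (*last_was_partial_copy = 0, *bytes_consumed = 0),
               returns 0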
1783static int
1784xlog_write_copy_finish(
1785 struct log *log,
1786 struct xlog_in_core *iclog,
1787 uint flags,
1788 int *record_cnt,
1789 int *data_cnt,
1790 int *partial_copy,
1791 int *partial_copy_len,
1792 int log_offset,
1793 struct xlog_in_core **commit_iclog)
1794{
1795 if (*partial_copy) {
1796 /*
1797 * This iclog has already been marked WANT_SYNC by
1798 * xlog_state_get_iclog_space.
1799 */
1800 xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
1801 *record_cnt = 0;
1802 *data_cnt = 0;
1803 return xlog_state_release_iclog(log, iclog);
1804 }
1805
1806 *partial_copy = 0;
1807 *partial_copy_len = 0;
1808
1809 if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
1810 /* no more space in this iclog - push it. */
1811 xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
1812 *record_cnt = 0;
1813 *data_cnt = 0;
1814
1815 spin_lock(&log->l_icloglock);
1816 xlog_state_want_sync(log, iclog);
1817 spin_unlock(&log->l_icloglock);
1818
1819 if (!commit_iclog)
1820 return xlog_state_release_iclog(log, iclog);
1821 ASSERT(flags & XLOG_COMMIT_TRANS);
1822 *commit_iclog = iclog;
1823 }
1824
1825 return 0;
1826}
1827
1828/*
1603 * Write some region out to in-core log 1829 * Write some region out to in-core log
1604 * 1830 *
1605 * This will be called when writing externally provided regions or when 1831 * This will be called when writing externally provided regions or when
@@ -1641,209 +1867,157 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
1641 */ 1867 */
1642STATIC int 1868STATIC int
1643xlog_write( 1869xlog_write(
1644 struct xfs_mount *mp, 1870 struct log *log,
1645 struct xfs_log_iovec reg[], 1871 struct xfs_log_vec *log_vector,
1646 int nentries,
1647 struct xlog_ticket *ticket, 1872 struct xlog_ticket *ticket,
1648 xfs_lsn_t *start_lsn, 1873 xfs_lsn_t *start_lsn,
1649 struct xlog_in_core **commit_iclog, 1874 struct xlog_in_core **commit_iclog,
1650 uint flags) 1875 uint flags)
1651{ 1876{
1652 xlog_t *log = mp->m_log; 1877 struct xlog_in_core *iclog = NULL;
1653 xlog_in_core_t *iclog = NULL; /* ptr to current in-core log */ 1878 struct xfs_log_iovec *vecp;
1654 xlog_op_header_t *logop_head; /* ptr to log operation header */ 1879 struct xfs_log_vec *lv;
1655 __psint_t ptr; /* copy address into data region */ 1880 int len;
1656 int len; /* # xlog_write() bytes 2 still copy */ 1881 int index;
1657 int index; /* region index currently copying */ 1882 int partial_copy = 0;
1658 int log_offset; /* offset (from 0) into data region */ 1883 int partial_copy_len = 0;
1659 int start_rec_copy; /* # bytes to copy for start record */ 1884 int contwr = 0;
1660 int partial_copy; /* did we split a region? */ 1885 int record_cnt = 0;
1661 int partial_copy_len;/* # bytes copied if split region */ 1886 int data_cnt = 0;
1662 int need_copy; /* # bytes need to memcpy this region */ 1887 int error;
1663 int copy_len; /* # bytes actually memcpy'ing */
1664 int copy_off; /* # bytes from entry start */
1665 int contwr; /* continued write of in-core log? */
1666 int error;
1667 int record_cnt = 0, data_cnt = 0;
1668
1669 partial_copy_len = partial_copy = 0;
1670
1671 /* Calculate potential maximum space. Each region gets its own
1672 * xlog_op_header_t and may need to be double word aligned.
1673 */
1674 len = 0;
1675 if (ticket->t_flags & XLOG_TIC_INITED) { /* acct for start rec of xact */
1676 len += sizeof(xlog_op_header_t);
1677 ticket->t_res_num_ophdrs++;
1678 }
1679 1888
1680 for (index = 0; index < nentries; index++) { 1889 *start_lsn = 0;
1681 len += sizeof(xlog_op_header_t); /* each region gets >= 1 */
1682 ticket->t_res_num_ophdrs++;
1683 len += reg[index].i_len;
1684 xlog_tic_add_region(ticket, reg[index].i_len, reg[index].i_type);
1685 }
1686 contwr = *start_lsn = 0;
1687 1890
1688 if (ticket->t_curr_res < len) { 1891 len = xlog_write_calc_vec_length(ticket, log_vector);
1689 xlog_print_tic_res(mp, ticket); 1892 if (ticket->t_curr_res < len) {
1893 xlog_print_tic_res(log->l_mp, ticket);
1690#ifdef DEBUG 1894#ifdef DEBUG
1691 xlog_panic( 1895 xlog_panic(
1692 "xfs_log_write: reservation ran out. Need to up reservation"); 1896 "xfs_log_write: reservation ran out. Need to up reservation");
1693#else 1897#else
1694 /* Customer configurable panic */ 1898 /* Customer configurable panic */
1695 xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp, 1899 xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, log->l_mp,
1696 "xfs_log_write: reservation ran out. Need to up reservation"); 1900 "xfs_log_write: reservation ran out. Need to up reservation");
1697 /* If we did not panic, shutdown the filesystem */ 1901
1698 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 1902 /* If we did not panic, shutdown the filesystem */
1903 xfs_force_shutdown(log->l_mp, SHUTDOWN_CORRUPT_INCORE);
1699#endif 1904#endif
1700 } else 1905 }
1906
1701 ticket->t_curr_res -= len; 1907 ticket->t_curr_res -= len;
1702 1908
1703 for (index = 0; index < nentries; ) { 1909 index = 0;
1704 if ((error = xlog_state_get_iclog_space(log, len, &iclog, ticket, 1910 lv = log_vector;
1705 &contwr, &log_offset))) 1911 vecp = lv->lv_iovecp;
1706 return error; 1912 while (lv && index < lv->lv_niovecs) {
1913 void *ptr;
1914 int log_offset;
1707 1915
1708 ASSERT(log_offset <= iclog->ic_size - 1); 1916 error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
1709 ptr = (__psint_t) ((char *)iclog->ic_datap+log_offset); 1917 &contwr, &log_offset);
1918 if (error)
1919 return error;
1710 1920
1711 /* start_lsn is the first lsn written to. That's all we need. */ 1921 ASSERT(log_offset <= iclog->ic_size - 1);
1712 if (! *start_lsn) 1922 ptr = iclog->ic_datap + log_offset;
1713 *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
1714 1923
1715 /* This loop writes out as many regions as can fit in the amount 1924 /* start_lsn is the first lsn written to. That's all we need. */
1716 * of space which was allocated by xlog_state_get_iclog_space(). 1925 if (!*start_lsn)
1717 */ 1926 *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
1718 while (index < nentries) {
1719 ASSERT(reg[index].i_len % sizeof(__int32_t) == 0);
1720 ASSERT((__psint_t)ptr % sizeof(__int32_t) == 0);
1721 start_rec_copy = 0;
1722
1723 /* If first write for transaction, insert start record.
1724 * We can't be trying to commit if we are inited. We can't
1725 * have any "partial_copy" if we are inited.
1726 */
1727 if (ticket->t_flags & XLOG_TIC_INITED) {
1728 logop_head = (xlog_op_header_t *)ptr;
1729 logop_head->oh_tid = cpu_to_be32(ticket->t_tid);
1730 logop_head->oh_clientid = ticket->t_clientid;
1731 logop_head->oh_len = 0;
1732 logop_head->oh_flags = XLOG_START_TRANS;
1733 logop_head->oh_res2 = 0;
1734 ticket->t_flags &= ~XLOG_TIC_INITED; /* clear bit */
1735 record_cnt++;
1736
1737 start_rec_copy = sizeof(xlog_op_header_t);
1738 xlog_write_adv_cnt(ptr, len, log_offset, start_rec_copy);
1739 }
1740 1927
1741 /* Copy log operation header directly into data section */ 1928 /*
1742 logop_head = (xlog_op_header_t *)ptr; 1929 * This loop writes out as many regions as can fit in the amount
1743 logop_head->oh_tid = cpu_to_be32(ticket->t_tid); 1930 * of space which was allocated by xlog_state_get_iclog_space().
1744 logop_head->oh_clientid = ticket->t_clientid; 1931 */
1745 logop_head->oh_res2 = 0; 1932 while (lv && index < lv->lv_niovecs) {
1933 struct xfs_log_iovec *reg = &vecp[index];
1934 struct xlog_op_header *ophdr;
1935 int start_rec_copy;
1936 int copy_len;
1937 int copy_off;
1938
1939 ASSERT(reg->i_len % sizeof(__int32_t) == 0);
1940 ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0);
1941
1942 start_rec_copy = xlog_write_start_rec(ptr, ticket);
1943 if (start_rec_copy) {
1944 record_cnt++;
1945 xlog_write_adv_cnt(&ptr, &len, &log_offset,
1946 start_rec_copy);
1947 }
1746 1948
1747 /* header copied directly */ 1949 ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
1748 xlog_write_adv_cnt(ptr, len, log_offset, sizeof(xlog_op_header_t)); 1950 if (!ophdr)
1951 return XFS_ERROR(EIO);
1749 1952
1750 /* are we copying a commit or unmount record? */ 1953 xlog_write_adv_cnt(&ptr, &len, &log_offset,
1751 logop_head->oh_flags = flags; 1954 sizeof(struct xlog_op_header));
1955
1956 len += xlog_write_setup_copy(ticket, ophdr,
1957 iclog->ic_size-log_offset,
1958 reg->i_len,
1959 &copy_off, &copy_len,
1960 &partial_copy,
1961 &partial_copy_len);
1962 xlog_verify_dest_ptr(log, ptr);
1963
1964 /* copy region */
1965 ASSERT(copy_len >= 0);
1966 memcpy(ptr, reg->i_addr + copy_off, copy_len);
1967 xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len);
1968
1969 copy_len += start_rec_copy + sizeof(xlog_op_header_t);
1970 record_cnt++;
1971 data_cnt += contwr ? copy_len : 0;
1972
1973 error = xlog_write_copy_finish(log, iclog, flags,
1974 &record_cnt, &data_cnt,
1975 &partial_copy,
1976 &partial_copy_len,
1977 log_offset,
1978 commit_iclog);
1979 if (error)
1980 return error;
1752 1981
1753 /* 1982 /*
1754 * We've seen logs corrupted with bad transaction client 1983 * if we had a partial copy, we need to get more iclog
1755 * ids. This makes sure that XFS doesn't generate them on. 1984 * space but we don't want to increment the region
 1756 * Turn this into an EIO and shut down the filesystem. 1985 * index because there is still more in this region to
1757 */ 1986 * write.
1758 switch (logop_head->oh_clientid) { 1987 *
1759 case XFS_TRANSACTION: 1988 * If we completed writing this region, and we flushed
1760 case XFS_VOLUME: 1989 * the iclog (indicated by resetting of the record
1761 case XFS_LOG: 1990 * count), then we also need to get more log space. If
1762 break; 1991 * this was the last record, though, we are done and
1763 default: 1992 * can just return.
1764 xfs_fs_cmn_err(CE_WARN, mp, 1993 */
1765 "Bad XFS transaction clientid 0x%x in ticket 0x%p", 1994 if (partial_copy)
1766 logop_head->oh_clientid, ticket); 1995 break;
1767 return XFS_ERROR(EIO);
1768 }
1769 1996
1770 /* Partial write last time? => (partial_copy != 0) 1997 if (++index == lv->lv_niovecs) {
1771 * need_copy is the amount we'd like to copy if everything could 1998 lv = lv->lv_next;
1772 * fit in the current memcpy. 1999 index = 0;
1773 */ 2000 if (lv)
1774 need_copy = reg[index].i_len - partial_copy_len; 2001 vecp = lv->lv_iovecp;
1775 2002 }
1776 copy_off = partial_copy_len; 2003 if (record_cnt == 0) {
1777 if (need_copy <= iclog->ic_size - log_offset) { /*complete write */ 2004 if (!lv)
1778 copy_len = need_copy; 2005 return 0;
1779 logop_head->oh_len = cpu_to_be32(copy_len); 2006 break;
1780 if (partial_copy) 2007 }
1781 logop_head->oh_flags|= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
1782 partial_copy_len = partial_copy = 0;
1783 } else { /* partial write */
1784 copy_len = iclog->ic_size - log_offset;
1785 logop_head->oh_len = cpu_to_be32(copy_len);
1786 logop_head->oh_flags |= XLOG_CONTINUE_TRANS;
1787 if (partial_copy)
1788 logop_head->oh_flags |= XLOG_WAS_CONT_TRANS;
1789 partial_copy_len += copy_len;
1790 partial_copy++;
1791 len += sizeof(xlog_op_header_t); /* from splitting of region */
1792 /* account for new log op header */
1793 ticket->t_curr_res -= sizeof(xlog_op_header_t);
1794 ticket->t_res_num_ophdrs++;
1795 }
1796 xlog_verify_dest_ptr(log, ptr);
1797
1798 /* copy region */
1799 ASSERT(copy_len >= 0);
1800 memcpy((xfs_caddr_t)ptr, reg[index].i_addr + copy_off, copy_len);
1801 xlog_write_adv_cnt(ptr, len, log_offset, copy_len);
1802
1803 /* make copy_len total bytes copied, including headers */
1804 copy_len += start_rec_copy + sizeof(xlog_op_header_t);
1805 record_cnt++;
1806 data_cnt += contwr ? copy_len : 0;
1807 if (partial_copy) { /* copied partial region */
1808 /* already marked WANT_SYNC by xlog_state_get_iclog_space */
1809 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
1810 record_cnt = data_cnt = 0;
1811 if ((error = xlog_state_release_iclog(log, iclog)))
1812 return error;
1813 break; /* don't increment index */
1814 } else { /* copied entire region */
1815 index++;
1816 partial_copy_len = partial_copy = 0;
1817
1818 if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
1819 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
1820 record_cnt = data_cnt = 0;
1821 spin_lock(&log->l_icloglock);
1822 xlog_state_want_sync(log, iclog);
1823 spin_unlock(&log->l_icloglock);
1824 if (commit_iclog) {
1825 ASSERT(flags & XLOG_COMMIT_TRANS);
1826 *commit_iclog = iclog;
1827 } else if ((error = xlog_state_release_iclog(log, iclog)))
1828 return error;
1829 if (index == nentries)
1830 return 0; /* we are done */
1831 else
1832 break;
1833 } 2008 }
1834 } /* if (partial_copy) */ 2009 }
1835 } /* while (index < nentries) */ 2010
1836 } /* for (index = 0; index < nentries; ) */ 2011 ASSERT(len == 0);
1837 ASSERT(len == 0); 2012
2013 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
2014 if (!commit_iclog)
2015 return xlog_state_release_iclog(log, iclog);
1838 2016
1839 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
1840 if (commit_iclog) {
1841 ASSERT(flags & XLOG_COMMIT_TRANS); 2017 ASSERT(flags & XLOG_COMMIT_TRANS);
1842 *commit_iclog = iclog; 2018 *commit_iclog = iclog;
1843 return 0; 2019 return 0;
1844 } 2020}
1845 return xlog_state_release_iclog(log, iclog);
1846} /* xlog_write */
1847 2021
1848 2022
1849/***************************************************************************** 2023/*****************************************************************************
@@ -3143,14 +3317,16 @@ xfs_log_ticket_get(
3143 * Allocate and initialise a new log ticket. 3317 * Allocate and initialise a new log ticket.
3144 */ 3318 */
3145STATIC xlog_ticket_t * 3319STATIC xlog_ticket_t *
3146xlog_ticket_alloc(xlog_t *log, 3320xlog_ticket_alloc(
3147 int unit_bytes, 3321 struct log *log,
3148 int cnt, 3322 int unit_bytes,
3149 char client, 3323 int cnt,
3150 uint xflags) 3324 char client,
3325 uint xflags)
3151{ 3326{
3152 xlog_ticket_t *tic; 3327 struct xlog_ticket *tic;
3153 uint num_headers; 3328 uint num_headers;
3329 int iclog_space;
3154 3330
3155 tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL); 3331 tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL);
3156 if (!tic) 3332 if (!tic)
@@ -3194,16 +3370,40 @@ xlog_ticket_alloc(xlog_t *log,
3194 /* for start-rec */ 3370 /* for start-rec */
3195 unit_bytes += sizeof(xlog_op_header_t); 3371 unit_bytes += sizeof(xlog_op_header_t);
3196 3372
3197 /* for LR headers */ 3373 /*
3198 num_headers = ((unit_bytes + log->l_iclog_size-1) >> log->l_iclog_size_log); 3374 * for LR headers - the space for data in an iclog is the size minus
3375 * the space used for the headers. If we use the iclog size, then we
3376 * undercalculate the number of headers required.
3377 *
3378 * Furthermore - the addition of op headers for split-recs might
3379 * increase the space required enough to require more log and op
3380 * headers, so take that into account too.
3381 *
3382 * IMPORTANT: This reservation makes the assumption that if this
3383 * transaction is the first in an iclog and hence has the LR headers
3384 * accounted to it, then the remaining space in the iclog is
3385 * exclusively for this transaction. i.e. if the transaction is larger
3386 * than the iclog, it will be the only thing in that iclog.
3387 * Fundamentally, this means we must pass the entire log vector to
3388 * xlog_write to guarantee this.
3389 */
3390 iclog_space = log->l_iclog_size - log->l_iclog_hsize;
3391 num_headers = howmany(unit_bytes, iclog_space);
3392
3393 /* for split-recs - ophdrs added when data split over LRs */
3394 unit_bytes += sizeof(xlog_op_header_t) * num_headers;
3395
3396 /* add extra header reservations if we overrun */
3397 while (!num_headers ||
3398 howmany(unit_bytes, iclog_space) > num_headers) {
3399 unit_bytes += sizeof(xlog_op_header_t);
3400 num_headers++;
3401 }
3199 unit_bytes += log->l_iclog_hsize * num_headers; 3402 unit_bytes += log->l_iclog_hsize * num_headers;
3200 3403
3201 /* for commit-rec LR header - note: padding will subsume the ophdr */ 3404 /* for commit-rec LR header - note: padding will subsume the ophdr */
3202 unit_bytes += log->l_iclog_hsize; 3405 unit_bytes += log->l_iclog_hsize;
3203 3406
3204 /* for split-recs - ophdrs added when data split over LRs */
3205 unit_bytes += sizeof(xlog_op_header_t) * num_headers;
3206
3207 /* for roundoff padding for transaction data and one for commit record */ 3407 /* for roundoff padding for transaction data and one for commit record */
3208 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && 3408 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
3209 log->l_mp->m_sb.sb_logsunit > 1) { 3409 log->l_mp->m_sb.sb_logsunit > 1) {
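Worked numbers for the reworked header reservation (all values assumed for illustration): with a 32KB iclog and a 512-byte iclog header:

    iclog_space  = 32768 - 512 = 32256
    num_headers  = howmany(70000, 32256) = 3        /* for unit_bytes = 70000 */
    unit_bytes  += 3 * sizeof(xlog_op_header_t)     /* split-rec ophdrs, +36 */
    /* howmany(70036, 32256) is still 3, so the while loop adds nothing */
    unit_bytes  += 3 * 512                          /* LR header space */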
@@ -3219,13 +3419,13 @@ xlog_ticket_alloc(xlog_t *log,
3219 tic->t_curr_res = unit_bytes; 3419 tic->t_curr_res = unit_bytes;
3220 tic->t_cnt = cnt; 3420 tic->t_cnt = cnt;
3221 tic->t_ocnt = cnt; 3421 tic->t_ocnt = cnt;
3222 tic->t_tid = (xlog_tid_t)((__psint_t)tic & 0xffffffff); 3422 tic->t_tid = random32();
3223 tic->t_clientid = client; 3423 tic->t_clientid = client;
3224 tic->t_flags = XLOG_TIC_INITED; 3424 tic->t_flags = XLOG_TIC_INITED;
3225 tic->t_trans_type = 0; 3425 tic->t_trans_type = 0;
3226 if (xflags & XFS_LOG_PERM_RESERV) 3426 if (xflags & XFS_LOG_PERM_RESERV)
3227 tic->t_flags |= XLOG_TIC_PERM_RESERV; 3427 tic->t_flags |= XLOG_TIC_PERM_RESERV;
3228 sv_init(&(tic->t_wait), SV_DEFAULT, "logtick"); 3428 sv_init(&tic->t_wait, SV_DEFAULT, "logtick");
3229 3429
3230 xlog_tic_reset_res(tic); 3430 xlog_tic_reset_res(tic);
3231 3431
@@ -3246,20 +3446,22 @@ xlog_ticket_alloc(xlog_t *log,
3246 * part of the log in case we trash the log structure. 3446 * part of the log in case we trash the log structure.
3247 */ 3447 */
3248void 3448void
3249xlog_verify_dest_ptr(xlog_t *log, 3449xlog_verify_dest_ptr(
3250 __psint_t ptr) 3450 struct log *log,
3451 char *ptr)
3251{ 3452{
3252 int i; 3453 int i;
3253 int good_ptr = 0; 3454 int good_ptr = 0;
3254 3455
3255 for (i=0; i < log->l_iclog_bufs; i++) { 3456 for (i = 0; i < log->l_iclog_bufs; i++) {
3256 if (ptr >= (__psint_t)log->l_iclog_bak[i] && 3457 if (ptr >= log->l_iclog_bak[i] &&
3257 ptr <= (__psint_t)log->l_iclog_bak[i]+log->l_iclog_size) 3458 ptr <= log->l_iclog_bak[i] + log->l_iclog_size)
3258 good_ptr++; 3459 good_ptr++;
3259 } 3460 }
3260 if (! good_ptr) 3461
3462 if (!good_ptr)
3261 xlog_panic("xlog_verify_dest_ptr: invalid ptr"); 3463 xlog_panic("xlog_verify_dest_ptr: invalid ptr");
3262} /* xlog_verify_dest_ptr */ 3464}
3263 3465
3264STATIC void 3466STATIC void
3265xlog_verify_grant_head(xlog_t *log, int equals) 3467xlog_verify_grant_head(xlog_t *log, int equals)
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 97a24c7795a4..229d1f36ba9a 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -110,6 +110,12 @@ typedef struct xfs_log_iovec {
110 uint i_type; /* type of region */ 110 uint i_type; /* type of region */
111} xfs_log_iovec_t; 111} xfs_log_iovec_t;
112 112
113struct xfs_log_vec {
114 struct xfs_log_vec *lv_next; /* next lv in build list */
115 int lv_niovecs; /* number of iovecs in lv */
116 struct xfs_log_iovec *lv_iovecp; /* iovec array */
117};
118
113/* 119/*
114 * Structure used to pass callback function and the function's argument 120 * Structure used to pass callback function and the function's argument
115 * to the log manager. 121 * to the log manager.
@@ -126,6 +132,13 @@ typedef struct xfs_log_callback {
126struct xfs_mount; 132struct xfs_mount;
127struct xlog_in_core; 133struct xlog_in_core;
128struct xlog_ticket; 134struct xlog_ticket;
135struct xfs_log_item;
136struct xfs_item_ops;
137
138void xfs_log_item_init(struct xfs_mount *mp,
139 struct xfs_log_item *item,
140 int type,
141 struct xfs_item_ops *ops);
129 142
130xfs_lsn_t xfs_log_done(struct xfs_mount *mp, 143xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
131 struct xlog_ticket *ticket, 144 struct xlog_ticket *ticket,
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index fd02a18facd5..9cf695154451 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -396,9 +396,7 @@ typedef struct log {
396 struct xfs_buf_cancel **l_buf_cancel_table; 396 struct xfs_buf_cancel **l_buf_cancel_table;
397 int l_iclog_hsize; /* size of iclog header */ 397 int l_iclog_hsize; /* size of iclog header */
398 int l_iclog_heads; /* # of iclog header sectors */ 398 int l_iclog_heads; /* # of iclog header sectors */
399 uint l_sectbb_log; /* log2 of sector size in BBs */ 399 uint l_sectBBsize; /* sector size in BBs (2^n) */
400 uint l_sectbb_mask; /* sector size (in BBs)
401 * alignment mask */
402 int l_iclog_size; /* size of log in bytes */ 400 int l_iclog_size; /* size of log in bytes */
403 int l_iclog_size_log; /* log power size of log */ 401 int l_iclog_size_log; /* log power size of log */
404 int l_iclog_bufs; /* number of iclog buffers */ 402 int l_iclog_bufs; /* number of iclog buffers */
@@ -449,6 +447,14 @@ extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
449 447
450extern kmem_zone_t *xfs_log_ticket_zone; 448extern kmem_zone_t *xfs_log_ticket_zone;
451 449
450static inline void
451xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
452{
453 *ptr += bytes;
454 *len -= bytes;
455 *off += bytes;
456}
457
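Unlike the macro it replaces, the inline helper takes the write cursor by address, so the arguments are type-checked; usage as seen in the reworked xlog_write():

    void *ptr = iclog->ic_datap + log_offset;

    xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len);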
452/* 458/*
453 * Unmount record type is used as a pseudo transaction type for the ticket. 459 * Unmount record type is used as a pseudo transaction type for the ticket.
454 * It's value must be outside the range of XFS_TRANS_* values. 460 * It's value must be outside the range of XFS_TRANS_* values.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 22e6efdc17ea..0de08e366315 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -56,33 +56,61 @@ STATIC void xlog_recover_check_summary(xlog_t *);
56#define xlog_recover_check_summary(log) 56#define xlog_recover_check_summary(log)
57#endif 57#endif
58 58
59
60/* 59/*
61 * Sector aligned buffer routines for buffer create/read/write/access 60 * Sector aligned buffer routines for buffer create/read/write/access
62 */ 61 */
63 62
64#define XLOG_SECTOR_ROUNDUP_BBCOUNT(log, bbs) \ 63/*
 65 ( ((log)->l_sectbb_mask && (bbs & (log)->l_sectbb_mask)) ? \ 64 * Verify the given count of basic blocks is a valid number of blocks
66 ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) ) 65 * to specify for an operation involving the given XFS log buffer.
67#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask) 66 * Returns nonzero if the count is valid, 0 otherwise.
67 */
68 68
69static inline int
70xlog_buf_bbcount_valid(
71 xlog_t *log,
72 int bbcount)
73{
74 return bbcount > 0 && bbcount <= log->l_logBBsize;
75}
76
77/*
78 * Allocate a buffer to hold log data. The buffer needs to be able
79 * to map to a range of nbblks basic blocks at any valid (basic
80 * block) offset within the log.
81 */
69STATIC xfs_buf_t * 82STATIC xfs_buf_t *
70xlog_get_bp( 83xlog_get_bp(
71 xlog_t *log, 84 xlog_t *log,
72 int nbblks) 85 int nbblks)
73{ 86{
74 if (nbblks <= 0 || nbblks > log->l_logBBsize) { 87 if (!xlog_buf_bbcount_valid(log, nbblks)) {
75 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); 88 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
76 XFS_ERROR_REPORT("xlog_get_bp(1)", 89 nbblks);
77 XFS_ERRLEVEL_HIGH, log->l_mp); 90 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
78 return NULL; 91 return NULL;
79 } 92 }
80 93
81 if (log->l_sectbb_log) { 94 /*
82 if (nbblks > 1) 95 * We do log I/O in units of log sectors (a power-of-2
83 nbblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); 96 * multiple of the basic block size), so we round up the
 84 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); 97 * requested size to accommodate the basic blocks required
85 } 98 * for complete log sectors.
99 *
100 * In addition, the buffer may be used for a non-sector-
101 * aligned block offset, in which case an I/O of the
102 * requested size could extend beyond the end of the
103 * buffer. If the requested size is only 1 basic block it
104 * will never straddle a sector boundary, so this won't be
105 * an issue. Nor will this be a problem if the log I/O is
106 * done in basic blocks (sector size 1). But otherwise we
107 * extend the buffer by one extra log sector to ensure
 108 * there's space to accommodate this possibility.
109 */
110 if (nbblks > 1 && log->l_sectBBsize > 1)
111 nbblks += log->l_sectBBsize;
112 nbblks = round_up(nbblks, log->l_sectBBsize);
113
86 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp); 114 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
87} 115}
88 116
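A worked example of the sizing (sector size assumed): with l_sectBBsize = 8, i.e. a 4KB log sector, and a request for nbblks = 5:

    nbblks = 5 + 8 = 13              /* multi-block read may be misaligned */
    nbblks = round_up(13, 8) = 16    /* whole log sectors */
    /* the buffer maps BBTOB(16) = 8KB, enough for any 5-block range
     * starting at any basic block offset within the log */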
@@ -93,6 +121,10 @@ xlog_put_bp(
93 xfs_buf_free(bp); 121 xfs_buf_free(bp);
94} 122}
95 123
124/*
125 * Return the address of the start of the given block number's data
126 * in a log buffer. The buffer covers a log sector-aligned region.
127 */
96STATIC xfs_caddr_t 128STATIC xfs_caddr_t
97xlog_align( 129xlog_align(
98 xlog_t *log, 130 xlog_t *log,
@@ -100,14 +132,14 @@ xlog_align(
100 int nbblks, 132 int nbblks,
101 xfs_buf_t *bp) 133 xfs_buf_t *bp)
102{ 134{
135 xfs_daddr_t offset;
103 xfs_caddr_t ptr; 136 xfs_caddr_t ptr;
104 137
105 if (!log->l_sectbb_log) 138 offset = blk_no & ((xfs_daddr_t) log->l_sectBBsize - 1);
106 return XFS_BUF_PTR(bp); 139 ptr = XFS_BUF_PTR(bp) + BBTOB(offset);
140
141 ASSERT(ptr + BBTOB(nbblks) <= XFS_BUF_PTR(bp) + XFS_BUF_SIZE(bp));
107 142
108 ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
109 ASSERT(XFS_BUF_SIZE(bp) >=
110 BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
111 return ptr; 143 return ptr;
112} 144}
113 145
@@ -124,21 +156,18 @@ xlog_bread_noalign(
124{ 156{
125 int error; 157 int error;
126 158
127 if (nbblks <= 0 || nbblks > log->l_logBBsize) { 159 if (!xlog_buf_bbcount_valid(log, nbblks)) {
128 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); 160 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
129 XFS_ERROR_REPORT("xlog_bread(1)", 161 nbblks);
130 XFS_ERRLEVEL_HIGH, log->l_mp); 162 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
131 return EFSCORRUPTED; 163 return EFSCORRUPTED;
132 } 164 }
133 165
134 if (log->l_sectbb_log) { 166 blk_no = round_down(blk_no, log->l_sectBBsize);
135 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 167 nbblks = round_up(nbblks, log->l_sectBBsize);
136 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
137 }
138 168
139 ASSERT(nbblks > 0); 169 ASSERT(nbblks > 0);
140 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); 170 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
141 ASSERT(bp);
142 171
143 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); 172 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
144 XFS_BUF_READ(bp); 173 XFS_BUF_READ(bp);
@@ -186,17 +215,15 @@ xlog_bwrite(
186{ 215{
187 int error; 216 int error;
188 217
189 if (nbblks <= 0 || nbblks > log->l_logBBsize) { 218 if (!xlog_buf_bbcount_valid(log, nbblks)) {
190 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); 219 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
191 XFS_ERROR_REPORT("xlog_bwrite(1)", 220 nbblks);
192 XFS_ERRLEVEL_HIGH, log->l_mp); 221 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
193 return EFSCORRUPTED; 222 return EFSCORRUPTED;
194 } 223 }
195 224
196 if (log->l_sectbb_log) { 225 blk_no = round_down(blk_no, log->l_sectBBsize);
197 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 226 nbblks = round_up(nbblks, log->l_sectBBsize);
198 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
199 }
200 227
201 ASSERT(nbblks > 0); 228 ASSERT(nbblks > 0);
202 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); 229 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
@@ -327,39 +354,38 @@ xlog_find_cycle_start(
327{ 354{
328 xfs_caddr_t offset; 355 xfs_caddr_t offset;
329 xfs_daddr_t mid_blk; 356 xfs_daddr_t mid_blk;
357 xfs_daddr_t end_blk;
330 uint mid_cycle; 358 uint mid_cycle;
331 int error; 359 int error;
332 360
333 mid_blk = BLK_AVG(first_blk, *last_blk); 361 end_blk = *last_blk;
334 while (mid_blk != first_blk && mid_blk != *last_blk) { 362 mid_blk = BLK_AVG(first_blk, end_blk);
363 while (mid_blk != first_blk && mid_blk != end_blk) {
335 error = xlog_bread(log, mid_blk, 1, bp, &offset); 364 error = xlog_bread(log, mid_blk, 1, bp, &offset);
336 if (error) 365 if (error)
337 return error; 366 return error;
338 mid_cycle = xlog_get_cycle(offset); 367 mid_cycle = xlog_get_cycle(offset);
339 if (mid_cycle == cycle) { 368 if (mid_cycle == cycle)
340 *last_blk = mid_blk; 369 end_blk = mid_blk; /* last_half_cycle == mid_cycle */
341 /* last_half_cycle == mid_cycle */ 370 else
342 } else { 371 first_blk = mid_blk; /* first_half_cycle == mid_cycle */
343 first_blk = mid_blk; 372 mid_blk = BLK_AVG(first_blk, end_blk);
344 /* first_half_cycle == mid_cycle */
345 }
346 mid_blk = BLK_AVG(first_blk, *last_blk);
347 } 373 }
348 ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) || 374 ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
349 (mid_blk == *last_blk && mid_blk-1 == first_blk)); 375 (mid_blk == end_blk && mid_blk-1 == first_blk));
376
377 *last_blk = end_blk;
350 378
351 return 0; 379 return 0;
352} 380}
353 381
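A short trace of the reworked binary search, with an invented cycle layout: blocks 0..7 holding cycle numbers [9,9,9,9,8,8,8,8], first_blk = 0, *last_blk = 7, cycle = 8:

    mid_blk = 3: cycle 9 != 8  -> first_blk = 3
    mid_blk = 5: cycle 8 == 8  -> end_blk = 5
    mid_blk = 4: cycle 8 == 8  -> end_blk = 4
    mid_blk = 3 == first_blk   -> loop exits, *last_blk = 4
    /* block 4 is the first block carrying cycle 8, as intended */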
354/* 382/*
355 * Check that the range of blocks does not contain the cycle number 383 * Check that a range of blocks does not contain stop_on_cycle_no.
356 * given. The scan needs to occur from front to back and the ptr into the 384 * Fill in *new_blk with the block offset where such a block is
357 * region must be updated since a later routine will need to perform another 385 * found, or with -1 (an invalid block number) if there is no such
358 * test. If the region is completely good, we end up returning the same 386 * block in the range. The scan needs to occur from front to back
359 * last block number. 387 * and the pointer into the region must be updated since a later
360 * 388 * routine will need to perform another test.
361 * Set blkno to -1 if we encounter no errors. This is an invalid block number
362 * since we don't ever expect logs to get this large.
363 */ 389 */
364STATIC int 390STATIC int
365xlog_find_verify_cycle( 391xlog_find_verify_cycle(
@@ -376,12 +402,16 @@ xlog_find_verify_cycle(
376 xfs_caddr_t buf = NULL; 402 xfs_caddr_t buf = NULL;
377 int error = 0; 403 int error = 0;
378 404
405 /*
406 * Greedily allocate a buffer big enough to handle the full
407 * range of basic blocks we'll be examining. If that fails,
408 * try a smaller size. We need to be able to read at least
409 * a log sector, or we're out of luck.
410 */
379 bufblks = 1 << ffs(nbblks); 411 bufblks = 1 << ffs(nbblks);
380
381 while (!(bp = xlog_get_bp(log, bufblks))) { 412 while (!(bp = xlog_get_bp(log, bufblks))) {
382 /* can't get enough memory to do everything in one big buffer */
383 bufblks >>= 1; 413 bufblks >>= 1;
384 if (bufblks <= log->l_sectbb_log) 414 if (bufblks < log->l_sectBBsize)
385 return ENOMEM; 415 return ENOMEM;
386 } 416 }
387 417
@@ -629,7 +659,7 @@ xlog_find_head(
629 * In this case we want to find the first block with cycle 659 * In this case we want to find the first block with cycle
630 * number matching last_half_cycle. We expect the log to be 660 * number matching last_half_cycle. We expect the log to be
631 * some variation on 661 * some variation on
632 * x + 1 ... | x ... 662 * x + 1 ... | x ... | x
633 * The first block with cycle number x (last_half_cycle) will 663 * The first block with cycle number x (last_half_cycle) will
634 * be where the new head belongs. First we do a binary search 664 * be where the new head belongs. First we do a binary search
635 * for the first occurrence of last_half_cycle. The binary 665 * for the first occurrence of last_half_cycle. The binary
@@ -639,11 +669,13 @@ xlog_find_head(
639 * the log, then we look for occurrences of last_half_cycle - 1 669 * the log, then we look for occurrences of last_half_cycle - 1
640 * at the end of the log. The cases we're looking for look 670 * at the end of the log. The cases we're looking for look
641 * like 671 * like
642 * x + 1 ... | x | x + 1 | x ... 672 * v binary search stopped here
643 * ^ binary search stopped here 673 * x + 1 ... | x | x + 1 | x ... | x
674 * ^ but we want to locate this spot
644 * or 675 * or
645 * x + 1 ... | x ... | x - 1 | x
646 * <---------> less than scan distance 676 * <---------> less than scan distance
677 * x + 1 ... | x ... | x - 1 | x
678 * ^ we want to locate this spot
647 */ 679 */
648 stop_on_cycle = last_half_cycle; 680 stop_on_cycle = last_half_cycle;
649 if ((error = xlog_find_cycle_start(log, bp, first_blk, 681 if ((error = xlog_find_cycle_start(log, bp, first_blk,
@@ -699,16 +731,16 @@ xlog_find_head(
699 * certainly not the head of the log. By searching for 731 * certainly not the head of the log. By searching for
700 * last_half_cycle-1 we accomplish that. 732 * last_half_cycle-1 we accomplish that.
701 */ 733 */
702 start_blk = log_bbnum - num_scan_bblks + head_blk;
703 ASSERT(head_blk <= INT_MAX && 734 ASSERT(head_blk <= INT_MAX &&
704 (xfs_daddr_t) num_scan_bblks - head_blk >= 0); 735 (xfs_daddr_t) num_scan_bblks >= head_blk);
736 start_blk = log_bbnum - (num_scan_bblks - head_blk);
705 if ((error = xlog_find_verify_cycle(log, start_blk, 737 if ((error = xlog_find_verify_cycle(log, start_blk,
706 num_scan_bblks - (int)head_blk, 738 num_scan_bblks - (int)head_blk,
707 (stop_on_cycle - 1), &new_blk))) 739 (stop_on_cycle - 1), &new_blk)))
708 goto bp_err; 740 goto bp_err;
709 if (new_blk != -1) { 741 if (new_blk != -1) {
710 head_blk = new_blk; 742 head_blk = new_blk;
711 goto bad_blk; 743 goto validate_head;
712 } 744 }
713 745
714 /* 746 /*
@@ -726,7 +758,7 @@ xlog_find_head(
726 head_blk = new_blk; 758 head_blk = new_blk;
727 } 759 }
728 760
729 bad_blk: 761validate_head:
730 /* 762 /*
731 * Now we need to make sure head_blk is not pointing to a block in 763 * Now we need to make sure head_blk is not pointing to a block in
732 * the middle of a log record. 764 * the middle of a log record.
@@ -748,7 +780,7 @@ xlog_find_head(
748 if ((error = xlog_find_verify_log_record(log, start_blk, 780 if ((error = xlog_find_verify_log_record(log, start_blk,
749 &head_blk, 0)) == -1) { 781 &head_blk, 0)) == -1) {
750 /* We hit the beginning of the log during our search */ 782 /* We hit the beginning of the log during our search */
751 start_blk = log_bbnum - num_scan_bblks + head_blk; 783 start_blk = log_bbnum - (num_scan_bblks - head_blk);
752 new_blk = log_bbnum; 784 new_blk = log_bbnum;
753 ASSERT(start_blk <= INT_MAX && 785 ASSERT(start_blk <= INT_MAX &&
754 (xfs_daddr_t) log_bbnum-start_blk >= 0); 786 (xfs_daddr_t) log_bbnum-start_blk >= 0);
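
Both hunks above regroup log_bbnum - num_scan_bblks + head_blk as log_bbnum - (num_scan_bblks - head_blk). The results are algebraically identical; the regrouped form mirrors the adjacent ASSERT that num_scan_bblks >= head_blk, making it clear the inner subtraction stays non-negative when the scan window wraps past the physical start of the log. A stand-alone sketch of the wrap arithmetic, with a simplified type standing in for xfs_daddr_t:

typedef long long xdaddr_t;	/* stands in for xfs_daddr_t */

/*
 * When head_blk < num_scan_bblks, part of the scan window lies at
 * the physical end of the log and wraps around to block 0; the
 * window therefore starts this many blocks before the end.
 */
static xdaddr_t wrapped_scan_start(xdaddr_t log_bbnum,
				   xdaddr_t num_scan_bblks,
				   xdaddr_t head_blk)
{
	/* non-negative by the ASSERT: num_scan_bblks >= head_blk */
	return log_bbnum - (num_scan_bblks - head_blk);
}
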
@@ -833,12 +865,12 @@ xlog_find_tail(
833 if (*head_blk == 0) { /* special case */ 865 if (*head_blk == 0) { /* special case */
834 error = xlog_bread(log, 0, 1, bp, &offset); 866 error = xlog_bread(log, 0, 1, bp, &offset);
835 if (error) 867 if (error)
836 goto bread_err; 868 goto done;
837 869
838 if (xlog_get_cycle(offset) == 0) { 870 if (xlog_get_cycle(offset) == 0) {
839 *tail_blk = 0; 871 *tail_blk = 0;
840 /* leave all other log inited values alone */ 872 /* leave all other log inited values alone */
841 goto exit; 873 goto done;
842 } 874 }
843 } 875 }
844 876
@@ -849,7 +881,7 @@ xlog_find_tail(
849 for (i = (int)(*head_blk) - 1; i >= 0; i--) { 881 for (i = (int)(*head_blk) - 1; i >= 0; i--) {
850 error = xlog_bread(log, i, 1, bp, &offset); 882 error = xlog_bread(log, i, 1, bp, &offset);
851 if (error) 883 if (error)
852 goto bread_err; 884 goto done;
853 885
854 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { 886 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) {
855 found = 1; 887 found = 1;
@@ -866,7 +898,7 @@ xlog_find_tail(
866 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { 898 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
867 error = xlog_bread(log, i, 1, bp, &offset); 899 error = xlog_bread(log, i, 1, bp, &offset);
868 if (error) 900 if (error)
869 goto bread_err; 901 goto done;
870 902
871 if (XLOG_HEADER_MAGIC_NUM == 903 if (XLOG_HEADER_MAGIC_NUM ==
872 be32_to_cpu(*(__be32 *)offset)) { 904 be32_to_cpu(*(__be32 *)offset)) {
@@ -941,7 +973,7 @@ xlog_find_tail(
941 umount_data_blk = (i + hblks) % log->l_logBBsize; 973 umount_data_blk = (i + hblks) % log->l_logBBsize;
942 error = xlog_bread(log, umount_data_blk, 1, bp, &offset); 974 error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
943 if (error) 975 if (error)
944 goto bread_err; 976 goto done;
945 977
946 op_head = (xlog_op_header_t *)offset; 978 op_head = (xlog_op_header_t *)offset;
947 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { 979 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
@@ -987,12 +1019,10 @@ xlog_find_tail(
987 * But... if the -device- itself is readonly, just skip this. 1019 * But... if the -device- itself is readonly, just skip this.
988 * We can't recover this device anyway, so it won't matter. 1020 * We can't recover this device anyway, so it won't matter.
989 */ 1021 */
990 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { 1022 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
991 error = xlog_clear_stale_blocks(log, tail_lsn); 1023 error = xlog_clear_stale_blocks(log, tail_lsn);
992 }
993 1024
994bread_err: 1025done:
995exit:
996 xlog_put_bp(bp); 1026 xlog_put_bp(bp);
997 1027
998 if (error) 1028 if (error)
@@ -1152,16 +1182,22 @@ xlog_write_log_records(
1152 xfs_caddr_t offset; 1182 xfs_caddr_t offset;
1153 xfs_buf_t *bp; 1183 xfs_buf_t *bp;
1154 int balign, ealign; 1184 int balign, ealign;
1155 int sectbb = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); 1185 int sectbb = log->l_sectBBsize;
1156 int end_block = start_block + blocks; 1186 int end_block = start_block + blocks;
1157 int bufblks; 1187 int bufblks;
1158 int error = 0; 1188 int error = 0;
1159 int i, j = 0; 1189 int i, j = 0;
1160 1190
1191 /*
1192 * Greedily allocate a buffer big enough to handle the full
1193 * range of basic blocks to be written. If that fails, try
1194 * a smaller size. We need to be able to write at least a
1195 * log sector, or we're out of luck.
1196 */
1161 bufblks = 1 << ffs(blocks); 1197 bufblks = 1 << ffs(blocks);
1162 while (!(bp = xlog_get_bp(log, bufblks))) { 1198 while (!(bp = xlog_get_bp(log, bufblks))) {
1163 bufblks >>= 1; 1199 bufblks >>= 1;
1164 if (bufblks <= log->l_sectbb_log) 1200 if (bufblks < sectbb)
1165 return ENOMEM; 1201 return ENOMEM;
1166 } 1202 }
1167 1203
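
The comment added above states the allocation strategy; here is a self-contained user-space sketch of the same pattern, with malloc standing in for xlog_get_bp() and 512 for the basic-block size (function name is hypothetical):

#include <errno.h>
#include <stdlib.h>
#include <strings.h>		/* ffs() */

/*
 * Greedy buffer sizing: start with a power of two covering the
 * whole range, halve on allocation failure, and give up once the
 * buffer can no longer hold a single log sector.
 */
static int alloc_log_buffer(int blocks, int sectbb, void **bpp)
{
	int	bufblks = 1 << ffs(blocks);
	void	*bp;

	while (!(bp = malloc((size_t)bufblks * 512))) {
		bufblks >>= 1;
		if (bufblks < sectbb)
			return ENOMEM;
	}
	*bpp = bp;
	return 0;
}
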
@@ -1169,7 +1205,7 @@ xlog_write_log_records(
1169 * the buffer in the starting sector not covered by the first 1205 * the buffer in the starting sector not covered by the first
1170 * write below. 1206 * write below.
1171 */ 1207 */
1172 balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block); 1208 balign = round_down(start_block, sectbb);
1173 if (balign != start_block) { 1209 if (balign != start_block) {
1174 error = xlog_bread_noalign(log, start_block, 1, bp); 1210 error = xlog_bread_noalign(log, start_block, 1, bp);
1175 if (error) 1211 if (error)
@@ -1188,7 +1224,7 @@ xlog_write_log_records(
1188 * the buffer in the final sector not covered by the write. 1224 * the buffer in the final sector not covered by the write.
1189 * If this is the same sector as the above read, skip it. 1225 * If this is the same sector as the above read, skip it.
1190 */ 1226 */
1191 ealign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, end_block); 1227 ealign = round_down(end_block, sectbb);
1192 if (j == 0 && (start_block + endcount > ealign)) { 1228 if (j == 0 && (start_block + endcount > ealign)) {
1193 offset = XFS_BUF_PTR(bp); 1229 offset = XFS_BUF_PTR(bp);
1194 balign = BBTOB(ealign - start_block); 1230 balign = BBTOB(ealign - start_block);
@@ -1408,6 +1444,7 @@ xlog_recover_add_item(
1408 1444
1409STATIC int 1445STATIC int
1410xlog_recover_add_to_cont_trans( 1446xlog_recover_add_to_cont_trans(
1447 struct log *log,
1411 xlog_recover_t *trans, 1448 xlog_recover_t *trans,
1412 xfs_caddr_t dp, 1449 xfs_caddr_t dp,
1413 int len) 1450 int len)
@@ -1434,6 +1471,7 @@ xlog_recover_add_to_cont_trans(
1434 memcpy(&ptr[old_len], dp, len); /* d, s, l */ 1471 memcpy(&ptr[old_len], dp, len); /* d, s, l */
1435 item->ri_buf[item->ri_cnt-1].i_len += len; 1472 item->ri_buf[item->ri_cnt-1].i_len += len;
1436 item->ri_buf[item->ri_cnt-1].i_addr = ptr; 1473 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
1474 trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
1437 return 0; 1475 return 0;
1438} 1476}
1439 1477
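
For context, the body of xlog_recover_add_to_cont_trans() (only its tail is visible in the hunk above) reallocates the last logged region and appends the continuation bytes onto it. A reduced sketch of that append step, with a plain struct standing in for the ri_buf entry:

#include <stdlib.h>
#include <string.h>

struct region {			/* stands in for item->ri_buf[n] */
	char	*addr;
	int	len;
};

/* Append continuation data from a split log record onto the tail
 * of the previously recovered region, as the memcpy above does. */
static int append_cont(struct region *r, const char *dp, int len)
{
	char *ptr = realloc(r->addr, r->len + len);

	if (!ptr)
		return -1;
	memcpy(ptr + r->len, dp, len);	/* old contents preserved */
	r->addr = ptr;
	r->len += len;
	return 0;
}
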
@@ -1452,6 +1490,7 @@ xlog_recover_add_to_cont_trans(
1452 */ 1490 */
1453STATIC int 1491STATIC int
1454xlog_recover_add_to_trans( 1492xlog_recover_add_to_trans(
1493 struct log *log,
1455 xlog_recover_t *trans, 1494 xlog_recover_t *trans,
1456 xfs_caddr_t dp, 1495 xfs_caddr_t dp,
1457 int len) 1496 int len)
@@ -1510,6 +1549,7 @@ xlog_recover_add_to_trans(
1510 item->ri_buf[item->ri_cnt].i_addr = ptr; 1549 item->ri_buf[item->ri_cnt].i_addr = ptr;
1511 item->ri_buf[item->ri_cnt].i_len = len; 1550 item->ri_buf[item->ri_cnt].i_len = len;
1512 item->ri_cnt++; 1551 item->ri_cnt++;
1552 trace_xfs_log_recover_item_add(log, trans, item, 0);
1513 return 0; 1553 return 0;
1514} 1554}
1515 1555
@@ -1521,7 +1561,9 @@ xlog_recover_add_to_trans(
1521 */ 1561 */
1522STATIC int 1562STATIC int
1523xlog_recover_reorder_trans( 1563xlog_recover_reorder_trans(
1524 xlog_recover_t *trans) 1564 struct log *log,
1565 xlog_recover_t *trans,
1566 int pass)
1525{ 1567{
1526 xlog_recover_item_t *item, *n; 1568 xlog_recover_item_t *item, *n;
1527 LIST_HEAD(sort_list); 1569 LIST_HEAD(sort_list);
@@ -1535,6 +1577,8 @@ xlog_recover_reorder_trans(
1535 switch (ITEM_TYPE(item)) { 1577 switch (ITEM_TYPE(item)) {
1536 case XFS_LI_BUF: 1578 case XFS_LI_BUF:
1537 if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) { 1579 if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) {
1580 trace_xfs_log_recover_item_reorder_head(log,
1581 trans, item, pass);
1538 list_move(&item->ri_list, &trans->r_itemq); 1582 list_move(&item->ri_list, &trans->r_itemq);
1539 break; 1583 break;
1540 } 1584 }
@@ -1543,6 +1587,8 @@ xlog_recover_reorder_trans(
1543 case XFS_LI_QUOTAOFF: 1587 case XFS_LI_QUOTAOFF:
1544 case XFS_LI_EFD: 1588 case XFS_LI_EFD:
1545 case XFS_LI_EFI: 1589 case XFS_LI_EFI:
1590 trace_xfs_log_recover_item_reorder_tail(log,
1591 trans, item, pass);
1546 list_move_tail(&item->ri_list, &trans->r_itemq); 1592 list_move_tail(&item->ri_list, &trans->r_itemq);
1547 break; 1593 break;
1548 default: 1594 default:
@@ -1592,8 +1638,10 @@ xlog_recover_do_buffer_pass1(
1592 /* 1638 /*
1593 * If this isn't a cancel buffer item, then just return. 1639 * If this isn't a cancel buffer item, then just return.
1594 */ 1640 */
1595 if (!(flags & XFS_BLI_CANCEL)) 1641 if (!(flags & XFS_BLI_CANCEL)) {
1642 trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1596 return; 1643 return;
1644 }
1597 1645
1598 /* 1646 /*
1599 * Insert an xfs_buf_cancel record into the hash table of 1647 * Insert an xfs_buf_cancel record into the hash table of
@@ -1627,6 +1675,7 @@ xlog_recover_do_buffer_pass1(
1627 while (nextp != NULL) { 1675 while (nextp != NULL) {
1628 if (nextp->bc_blkno == blkno && nextp->bc_len == len) { 1676 if (nextp->bc_blkno == blkno && nextp->bc_len == len) {
1629 nextp->bc_refcount++; 1677 nextp->bc_refcount++;
1678 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
1630 return; 1679 return;
1631 } 1680 }
1632 prevp = nextp; 1681 prevp = nextp;
@@ -1640,6 +1689,7 @@ xlog_recover_do_buffer_pass1(
1640 bcp->bc_refcount = 1; 1689 bcp->bc_refcount = 1;
1641 bcp->bc_next = NULL; 1690 bcp->bc_next = NULL;
1642 prevp->bc_next = bcp; 1691 prevp->bc_next = bcp;
1692 trace_xfs_log_recover_buf_cancel_add(log, buf_f);
1643} 1693}
1644 1694
1645/* 1695/*
@@ -1779,6 +1829,8 @@ xlog_recover_do_inode_buffer(
1779 unsigned int *data_map = NULL; 1829 unsigned int *data_map = NULL;
1780 unsigned int map_size = 0; 1830 unsigned int map_size = 0;
1781 1831
1832 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
1833
1782 switch (buf_f->blf_type) { 1834 switch (buf_f->blf_type) {
1783 case XFS_LI_BUF: 1835 case XFS_LI_BUF:
1784 data_map = buf_f->blf_data_map; 1836 data_map = buf_f->blf_data_map;
@@ -1874,6 +1926,7 @@ xlog_recover_do_inode_buffer(
1874/*ARGSUSED*/ 1926/*ARGSUSED*/
1875STATIC void 1927STATIC void
1876xlog_recover_do_reg_buffer( 1928xlog_recover_do_reg_buffer(
1929 struct xfs_mount *mp,
1877 xlog_recover_item_t *item, 1930 xlog_recover_item_t *item,
1878 xfs_buf_t *bp, 1931 xfs_buf_t *bp,
1879 xfs_buf_log_format_t *buf_f) 1932 xfs_buf_log_format_t *buf_f)
@@ -1885,6 +1938,8 @@ xlog_recover_do_reg_buffer(
1885 unsigned int map_size = 0; 1938 unsigned int map_size = 0;
1886 int error; 1939 int error;
1887 1940
1941 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
1942
1888 switch (buf_f->blf_type) { 1943 switch (buf_f->blf_type) {
1889 case XFS_LI_BUF: 1944 case XFS_LI_BUF:
1890 data_map = buf_f->blf_data_map; 1945 data_map = buf_f->blf_data_map;
@@ -2083,6 +2138,8 @@ xlog_recover_do_dquot_buffer(
2083{ 2138{
2084 uint type; 2139 uint type;
2085 2140
2141 trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
2142
2086 /* 2143 /*
2087 * Filesystems are required to send in quota flags at mount time. 2144 * Filesystems are required to send in quota flags at mount time.
2088 */ 2145 */
@@ -2103,7 +2160,7 @@ xlog_recover_do_dquot_buffer(
2103 if (log->l_quotaoffs_flag & type) 2160 if (log->l_quotaoffs_flag & type)
2104 return; 2161 return;
2105 2162
2106 xlog_recover_do_reg_buffer(item, bp, buf_f); 2163 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2107} 2164}
2108 2165
2109/* 2166/*
@@ -2164,9 +2221,11 @@ xlog_recover_do_buffer_trans(
2164 */ 2221 */
2165 cancel = xlog_recover_do_buffer_pass2(log, buf_f); 2222 cancel = xlog_recover_do_buffer_pass2(log, buf_f);
2166 if (cancel) { 2223 if (cancel) {
2224 trace_xfs_log_recover_buf_cancel(log, buf_f);
2167 return 0; 2225 return 0;
2168 } 2226 }
2169 } 2227 }
2228 trace_xfs_log_recover_buf_recover(log, buf_f);
2170 switch (buf_f->blf_type) { 2229 switch (buf_f->blf_type) {
2171 case XFS_LI_BUF: 2230 case XFS_LI_BUF:
2172 blkno = buf_f->blf_blkno; 2231 blkno = buf_f->blf_blkno;
@@ -2204,7 +2263,7 @@ xlog_recover_do_buffer_trans(
2204 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { 2263 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
2205 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2264 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2206 } else { 2265 } else {
2207 xlog_recover_do_reg_buffer(item, bp, buf_f); 2266 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2208 } 2267 }
2209 if (error) 2268 if (error)
2210 return XFS_ERROR(error); 2269 return XFS_ERROR(error);
@@ -2284,8 +2343,10 @@ xlog_recover_do_inode_trans(
2284 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno, 2343 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
2285 in_f->ilf_len, 0)) { 2344 in_f->ilf_len, 0)) {
2286 error = 0; 2345 error = 0;
2346 trace_xfs_log_recover_inode_cancel(log, in_f);
2287 goto error; 2347 goto error;
2288 } 2348 }
2349 trace_xfs_log_recover_inode_recover(log, in_f);
2289 2350
2290 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 2351 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
2291 XBF_LOCK); 2352 XBF_LOCK);
@@ -2337,6 +2398,7 @@ xlog_recover_do_inode_trans(
2337 /* do nothing */ 2398 /* do nothing */
2338 } else { 2399 } else {
2339 xfs_buf_relse(bp); 2400 xfs_buf_relse(bp);
2401 trace_xfs_log_recover_inode_skip(log, in_f);
2340 error = 0; 2402 error = 0;
2341 goto error; 2403 goto error;
2342 } 2404 }
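
A pattern running through these hunks: each recovery helper gains a struct log (or mount) argument purely so the new trace_xfs_log_recover_* hooks have device context. The hooks themselves are generated by the kernel tracepoint machinery in xfs_trace.h; the stand-in below only shows the call shape — a printf in place of the real, compile-to-nop-when-disabled tracepoint:

#include <stdio.h>

struct log;			/* opaque, as in the callers above */
struct xlog_recover;
struct xlog_recover_item;

/* Illustrative stand-in for a TRACE_EVENT-generated hook. */
static void trace_item_recover(struct log *log,
			       struct xlog_recover *trans,
			       struct xlog_recover_item *item,
			       int pass)
{
	(void)log; (void)trans; (void)item;
	printf("log_recover_item_recover: pass %d\n", pass);
}
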
@@ -2758,11 +2820,12 @@ xlog_recover_do_trans(
2758 int error = 0; 2820 int error = 0;
2759 xlog_recover_item_t *item; 2821 xlog_recover_item_t *item;
2760 2822
2761 error = xlog_recover_reorder_trans(trans); 2823 error = xlog_recover_reorder_trans(log, trans, pass);
2762 if (error) 2824 if (error)
2763 return error; 2825 return error;
2764 2826
2765 list_for_each_entry(item, &trans->r_itemq, ri_list) { 2827 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2828 trace_xfs_log_recover_item_recover(log, trans, item, pass);
2766 switch (ITEM_TYPE(item)) { 2829 switch (ITEM_TYPE(item)) {
2767 case XFS_LI_BUF: 2830 case XFS_LI_BUF:
2768 error = xlog_recover_do_buffer_trans(log, item, pass); 2831 error = xlog_recover_do_buffer_trans(log, item, pass);
@@ -2919,8 +2982,9 @@ xlog_recover_process_data(
2919 error = xlog_recover_unmount_trans(trans); 2982 error = xlog_recover_unmount_trans(trans);
2920 break; 2983 break;
2921 case XLOG_WAS_CONT_TRANS: 2984 case XLOG_WAS_CONT_TRANS:
2922 error = xlog_recover_add_to_cont_trans(trans, 2985 error = xlog_recover_add_to_cont_trans(log,
2923 dp, be32_to_cpu(ohead->oh_len)); 2986 trans, dp,
2987 be32_to_cpu(ohead->oh_len));
2924 break; 2988 break;
2925 case XLOG_START_TRANS: 2989 case XLOG_START_TRANS:
2926 xlog_warn( 2990 xlog_warn(
@@ -2930,7 +2994,7 @@ xlog_recover_process_data(
2930 break; 2994 break;
2931 case 0: 2995 case 0:
2932 case XLOG_CONTINUE_TRANS: 2996 case XLOG_CONTINUE_TRANS:
2933 error = xlog_recover_add_to_trans(trans, 2997 error = xlog_recover_add_to_trans(log, trans,
2934 dp, be32_to_cpu(ohead->oh_len)); 2998 dp, be32_to_cpu(ohead->oh_len));
2935 break; 2999 break;
2936 default: 3000 default:
@@ -3331,42 +3395,6 @@ xlog_pack_data(
3331 } 3395 }
3332} 3396}
3333 3397
3334#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
3335STATIC void
3336xlog_unpack_data_checksum(
3337 xlog_rec_header_t *rhead,
3338 xfs_caddr_t dp,
3339 xlog_t *log)
3340{
3341 __be32 *up = (__be32 *)dp;
3342 uint chksum = 0;
3343 int i;
3344
3345 /* divide length by 4 to get # words */
3346 for (i=0; i < be32_to_cpu(rhead->h_len) >> 2; i++) {
3347 chksum ^= be32_to_cpu(*up);
3348 up++;
3349 }
3350 if (chksum != be32_to_cpu(rhead->h_chksum)) {
3351 if (rhead->h_chksum ||
3352 ((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) {
3353 cmn_err(CE_DEBUG,
3354 "XFS: LogR chksum mismatch: was (0x%x) is (0x%x)\n",
3355 be32_to_cpu(rhead->h_chksum), chksum);
3356 cmn_err(CE_DEBUG,
3357"XFS: Disregard message if filesystem was created with non-DEBUG kernel");
3358 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3359 cmn_err(CE_DEBUG,
3360 "XFS: LogR this is a LogV2 filesystem\n");
3361 }
3362 log->l_flags |= XLOG_CHKSUM_MISMATCH;
3363 }
3364 }
3365}
3366#else
3367#define xlog_unpack_data_checksum(rhead, dp, log)
3368#endif
3369
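
The debug-only helper deleted above folded the record payload into a 32-bit value by XOR-ing each word. An equivalent host-endian sketch of just the checksum math (the original applied be32_to_cpu to each word first):

#include <stdint.h>

/* XOR-fold a record payload, as the removed
 * xlog_unpack_data_checksum() did. */
static uint32_t xlog_xor_checksum(const uint32_t *words, int nwords)
{
	uint32_t chksum = 0;

	while (nwords-- > 0)
		chksum ^= *words++;
	return chksum;
}
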
3370STATIC void 3398STATIC void
3371xlog_unpack_data( 3399xlog_unpack_data(
3372 xlog_rec_header_t *rhead, 3400 xlog_rec_header_t *rhead,
@@ -3390,8 +3418,6 @@ xlog_unpack_data(
3390 dp += BBSIZE; 3418 dp += BBSIZE;
3391 } 3419 }
3392 } 3420 }
3393
3394 xlog_unpack_data_checksum(rhead, dp, log);
3395} 3421}
3396 3422
3397STATIC int 3423STATIC int
@@ -3490,7 +3516,7 @@ xlog_do_recovery_pass(
3490 hblks = 1; 3516 hblks = 1;
3491 } 3517 }
3492 } else { 3518 } else {
3493 ASSERT(log->l_sectbb_log == 0); 3519 ASSERT(log->l_sectBBsize == 1);
3494 hblks = 1; 3520 hblks = 1;
3495 hbp = xlog_get_bp(log, 1); 3521 hbp = xlog_get_bp(log, 1);
3496 h_size = XLOG_BIG_RECORD_BSIZE; 3522 h_size = XLOG_BIG_RECORD_BSIZE;
@@ -3946,10 +3972,6 @@ xlog_recover_check_summary(
3946 xfs_agf_t *agfp; 3972 xfs_agf_t *agfp;
3947 xfs_buf_t *agfbp; 3973 xfs_buf_t *agfbp;
3948 xfs_buf_t *agibp; 3974 xfs_buf_t *agibp;
3949 xfs_buf_t *sbbp;
3950#ifdef XFS_LOUD_RECOVERY
3951 xfs_sb_t *sbp;
3952#endif
3953 xfs_agnumber_t agno; 3975 xfs_agnumber_t agno;
3954 __uint64_t freeblks; 3976 __uint64_t freeblks;
3955 __uint64_t itotal; 3977 __uint64_t itotal;
@@ -3984,30 +4006,5 @@ xlog_recover_check_summary(
3984 xfs_buf_relse(agibp); 4006 xfs_buf_relse(agibp);
3985 } 4007 }
3986 } 4008 }
3987
3988 sbbp = xfs_getsb(mp, 0);
3989#ifdef XFS_LOUD_RECOVERY
3990 sbp = &mp->m_sb;
3991 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(sbbp));
3992 cmn_err(CE_NOTE,
3993 "xlog_recover_check_summary: sb_icount %Lu itotal %Lu",
3994 sbp->sb_icount, itotal);
3995 cmn_err(CE_NOTE,
3996 "xlog_recover_check_summary: sb_ifree %Lu itotal %Lu",
3997 sbp->sb_ifree, ifree);
3998 cmn_err(CE_NOTE,
3999 "xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu",
4000 sbp->sb_fdblocks, freeblks);
4001#if 0
4002 /*
4003 * This is turned off until I account for the allocation
4004 * btree blocks which live in free space.
4005 */
4006 ASSERT(sbp->sb_icount == itotal);
4007 ASSERT(sbp->sb_ifree == ifree);
4008 ASSERT(sbp->sb_fdblocks == freeblks);
4009#endif
4010#endif
4011 xfs_buf_relse(sbbp);
4012} 4009}
4013#endif /* DEBUG */ 4010#endif /* DEBUG */
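
One more note on the xlog_write_log_records() hunks earlier in this file: the private XLOG_SECTOR_ROUNDDOWN_BLKNO() macro is replaced by the kernel's generic round_down(), which for a power-of-two alignment is a single mask. A stand-alone equivalent:

/* round_down(x, y) for power-of-two y, matching the semantics of
 * the kernel helper used for sector alignment above. */
static inline long long round_down_pow2(long long x, long long y)
{
	return x & ~(y - 1);	/* e.g. round_down(19, 8) == 16 */
}
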
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index e79b56b4bca6..d7bf38c8cd1c 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1405,13 +1405,6 @@ xfs_mountfs(
1405 xfs_qm_mount_quotas(mp); 1405 xfs_qm_mount_quotas(mp);
1406 } 1406 }
1407 1407
1408#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
1409 if (XFS_IS_QUOTA_ON(mp))
1410 xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on");
1411 else
1412 xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on");
1413#endif
1414
1415 /* 1408 /*
1416 * Now we are mounted, reserve a small amount of unused space for 1409 * Now we are mounted, reserve a small amount of unused space for
1417 * privileged transactions. This is needed so that transaction 1410 * privileged transactions. This is needed so that transaction
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 4fa0bc7b983e..9ff48a16a7ee 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -259,6 +259,7 @@ typedef struct xfs_mount {
259 wait_queue_head_t m_wait_single_sync_task; 259 wait_queue_head_t m_wait_single_sync_task;
260 __int64_t m_update_flags; /* sb flags we need to update 260 __int64_t m_update_flags; /* sb flags we need to update
261 on the next remount,rw */ 261 on the next remount,rw */
262 struct list_head m_mplist; /* inode shrinker mount list */
262} xfs_mount_t; 263} xfs_mount_t;
263 264
264/* 265/*
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index fdcab3f81dde..e0e64b113bd6 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -201,9 +201,6 @@ typedef struct xfs_qoff_logformat {
201#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ 201#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
202#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */ 202#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */
203#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ 203#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
204#define XFS_QMOPT_QUOTAOFF 0x0000080 /* quotas are being turned off */
205#define XFS_QMOPT_UMOUNTING 0x0000100 /* filesys is being unmounted */
206#define XFS_QMOPT_DOLOG 0x0000200 /* log buf changes (in quotacheck) */
207#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ 204#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */
208#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ 205#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */
209#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ 206#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index f73e358bae8d..be578ecb4af2 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -45,23 +45,12 @@
45#include "xfs_trans_space.h" 45#include "xfs_trans_space.h"
46#include "xfs_inode_item.h" 46#include "xfs_inode_item.h"
47 47
48
49STATIC void xfs_trans_apply_sb_deltas(xfs_trans_t *);
50STATIC uint xfs_trans_count_vecs(xfs_trans_t *);
51STATIC void xfs_trans_fill_vecs(xfs_trans_t *, xfs_log_iovec_t *);
52STATIC void xfs_trans_uncommit(xfs_trans_t *, uint);
53STATIC void xfs_trans_committed(xfs_trans_t *, int);
54STATIC void xfs_trans_chunk_committed(xfs_log_item_chunk_t *, xfs_lsn_t, int);
55STATIC void xfs_trans_free(xfs_trans_t *);
56
57kmem_zone_t *xfs_trans_zone; 48kmem_zone_t *xfs_trans_zone;
58 49
59
60/* 50/*
61 * Reservation functions here avoid a huge stack in xfs_trans_init 51 * Reservation functions here avoid a huge stack in xfs_trans_init
62 * due to register overflow from temporaries in the calculations. 52 * due to register overflow from temporaries in the calculations.
63 */ 53 */
64
65STATIC uint 54STATIC uint
66xfs_calc_write_reservation(xfs_mount_t *mp) 55xfs_calc_write_reservation(xfs_mount_t *mp)
67{ 56{
@@ -261,6 +250,19 @@ _xfs_trans_alloc(
261} 250}
262 251
263/* 252/*
253 * Free the transaction structure. If there is more clean up
254 * to do when the structure is freed, add it here.
255 */
256STATIC void
257xfs_trans_free(
258 xfs_trans_t *tp)
259{
260 atomic_dec(&tp->t_mountp->m_active_trans);
261 xfs_trans_free_dqinfo(tp);
262 kmem_zone_free(xfs_trans_zone, tp);
263}
264
265/*
264 * This is called to create a new transaction which will share the 266 * This is called to create a new transaction which will share the
265 * permanent log reservation of the given transaction. The remaining 267 * permanent log reservation of the given transaction. The remaining
266 * unused block and rt extent reservations are also inherited. This 268 * unused block and rt extent reservations are also inherited. This
@@ -764,94 +766,278 @@ xfs_trans_unreserve_and_mod_sb(
764 } 766 }
765} 767}
766 768
769/*
770 * Total up the number of log iovecs needed to commit this
771 * transaction. The transaction itself needs one for the
772 * transaction header. Ask each dirty item in turn how many
773 * it needs to get the total.
774 */
775static uint
776xfs_trans_count_vecs(
777 struct xfs_trans *tp)
778{
779 int nvecs;
780 xfs_log_item_desc_t *lidp;
781
782 nvecs = 1;
783 lidp = xfs_trans_first_item(tp);
784 ASSERT(lidp != NULL);
785
786 /* In the non-debug case we need to start bailing out if we
787 * didn't find a log_item here, return zero and let trans_commit
788 * deal with it.
789 */
790 if (lidp == NULL)
791 return 0;
792
793 while (lidp != NULL) {
794 /*
795 * Skip items which aren't dirty in this transaction.
796 */
797 if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
798 lidp = xfs_trans_next_item(tp, lidp);
799 continue;
800 }
801 lidp->lid_size = IOP_SIZE(lidp->lid_item);
802 nvecs += lidp->lid_size;
803 lidp = xfs_trans_next_item(tp, lidp);
804 }
805
806 return nvecs;
807}
767 808
768/* 809/*
769 * xfs_trans_commit 810 * Fill in the vector with pointers to data to be logged
811 * by this transaction. The transaction header takes
812 * the first vector, and then each dirty item takes the
813 * number of vectors it indicated it needed in xfs_trans_count_vecs().
770 * 814 *
771 * Commit the given transaction to the log a/synchronously. 815 * As each item fills in the entries it needs, also pin the item
816 * so that it cannot be flushed out until the log write completes.
817 */
818static void
819xfs_trans_fill_vecs(
820 struct xfs_trans *tp,
821 struct xfs_log_iovec *log_vector)
822{
823 xfs_log_item_desc_t *lidp;
824 struct xfs_log_iovec *vecp;
825 uint nitems;
826
827 /*
828 * Skip over the entry for the transaction header, we'll
829 * fill that in at the end.
830 */
831 vecp = log_vector + 1;
832
833 nitems = 0;
834 lidp = xfs_trans_first_item(tp);
835 ASSERT(lidp);
836 while (lidp) {
837 /* Skip items which aren't dirty in this transaction. */
838 if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
839 lidp = xfs_trans_next_item(tp, lidp);
840 continue;
841 }
842
843 /*
844 * The item may be marked dirty but not log anything. This can
845 * be used to get called when a transaction is committed.
846 */
847 if (lidp->lid_size)
848 nitems++;
849 IOP_FORMAT(lidp->lid_item, vecp);
850 vecp += lidp->lid_size;
851 IOP_PIN(lidp->lid_item);
852 lidp = xfs_trans_next_item(tp, lidp);
853 }
854
855 /*
856 * Now that we've counted the number of items in this transaction, fill
857 * in the transaction header. Note that the transaction header does not
858 * have a log item.
859 */
860 tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC;
861 tp->t_header.th_type = tp->t_type;
862 tp->t_header.th_num_items = nitems;
863 log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
864 log_vector->i_len = sizeof(xfs_trans_header_t);
865 log_vector->i_type = XLOG_REG_TYPE_TRANSHDR;
866}
867
868/*
869 * The committed item processing consists of calling the committed routine of
870 * each logged item, updating the item's position in the AIL if necessary, and
871 * unpinning each item. If the committed routine returns -1, then do nothing
872 * further with the item because it may have been freed.
772 * 873 *
773 * XFS disk error handling mechanism is not based on a typical 874 * Since items are unlocked when they are copied to the incore log, it is
774 * transaction abort mechanism. Logically after the filesystem 875 * possible for two transactions to be completing and manipulating the same
775 * gets marked 'SHUTDOWN', we can't let any new transactions 876 * item simultaneously. The AIL lock will protect the lsn field of each item.
776 * be durable - ie. committed to disk - because some metadata might 877 * The value of this field can never go backwards.
777 * be inconsistent. In such cases, this returns an error, and the 878 *
778 * caller may assume that all locked objects joined to the transaction 879 * We unpin the items after repositioning them in the AIL, because otherwise
779 * have already been unlocked as if the commit had succeeded. 880 * they could be immediately flushed and we'd have to race with the flusher
780 * Do not reference the transaction structure after this call. 881 * trying to pull the item from the AIL as we add it.
781 */ 882 */
782 /*ARGSUSED*/ 883static void
783int 884xfs_trans_item_committed(
784_xfs_trans_commit( 885 struct xfs_log_item *lip,
785 xfs_trans_t *tp, 886 xfs_lsn_t commit_lsn,
786 uint flags, 887 int aborted)
787 int *log_flushed)
788{ 888{
789 xfs_log_iovec_t *log_vector; 889 xfs_lsn_t item_lsn;
790 int nvec; 890 struct xfs_ail *ailp;
791 xfs_mount_t *mp;
792 xfs_lsn_t commit_lsn;
793 /* REFERENCED */
794 int error;
795 int log_flags;
796 int sync;
797#define XFS_TRANS_LOGVEC_COUNT 16
798 xfs_log_iovec_t log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
799 struct xlog_in_core *commit_iclog;
800 int shutdown;
801 891
802 commit_lsn = -1; 892 if (aborted)
893 lip->li_flags |= XFS_LI_ABORTED;
894 item_lsn = IOP_COMMITTED(lip, commit_lsn);
895
896 /* If the committed routine returns -1, item has been freed. */
897 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
898 return;
803 899
804 /* 900 /*
805 * Determine whether this commit is releasing a permanent 901 * If the returned lsn is greater than what it contained before, update
806 * log reservation or not. 902 * the location of the item in the AIL. If it is not, then do nothing.
903 * Items can never move backwards in the AIL.
904 *
905 * While the new lsn should usually be greater, it is possible that a
906 * later transaction completing simultaneously with an earlier one
907 * using the same item could complete first with a higher lsn. This
908 * would cause the earlier transaction to fail the test below.
807 */ 909 */
808 if (flags & XFS_TRANS_RELEASE_LOG_RES) { 910 ailp = lip->li_ailp;
809 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 911 spin_lock(&ailp->xa_lock);
810 log_flags = XFS_LOG_REL_PERM_RESERV; 912 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
913 /*
914 * This will set the item's lsn to item_lsn and update the
915 * position of the item in the AIL.
916 *
917 * xfs_trans_ail_update() drops the AIL lock.
918 */
919 xfs_trans_ail_update(ailp, lip, item_lsn);
811 } else { 920 } else {
812 log_flags = 0; 921 spin_unlock(&ailp->xa_lock);
813 } 922 }
814 mp = tp->t_mountp;
815 923
816 /* 924 /*
817 * If there is nothing to be logged by the transaction, 925 * Now that we've repositioned the item in the AIL, unpin it so it can
818 * then unlock all of the items associated with the 926 * be flushed. Pass information about buffer stale state down from the
819 * transaction and free the transaction structure. 927 * log item flags, if anyone else stales the buffer we do not want to
820 * Also make sure to return any reserved blocks to 928 * pay any attention to it.
821 * the free pool.
822 */ 929 */
823shut_us_down: 930 IOP_UNPIN(lip);
824 shutdown = XFS_FORCED_SHUTDOWN(mp) ? EIO : 0; 931}
825 if (!(tp->t_flags & XFS_TRANS_DIRTY) || shutdown) { 932
826 xfs_trans_unreserve_and_mod_sb(tp); 933/* Clear all the per-AG busy list items listed in this transaction */
934static void
935xfs_trans_clear_busy_extents(
936 struct xfs_trans *tp)
937{
938 xfs_log_busy_chunk_t *lbcp;
939 xfs_log_busy_slot_t *lbsp;
940 int i;
941
942 for (lbcp = &tp->t_busy; lbcp != NULL; lbcp = lbcp->lbc_next) {
943 i = 0;
944 for (lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) {
945 if (XFS_LBC_ISFREE(lbcp, i))
946 continue;
947 xfs_alloc_clear_busy(tp, lbsp->lbc_ag, lbsp->lbc_idx);
948 }
949 }
950 xfs_trans_free_busy(tp);
951}
952
953/*
954 * This is typically called by the LM when a transaction has been fully
955 * committed to disk. It needs to unpin the items which have
956 * been logged by the transaction and update their positions
957 * in the AIL if necessary.
958 *
959 * This also gets called when the transactions didn't get written out
960 * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then.
961 */
962STATIC void
963xfs_trans_committed(
964 struct xfs_trans *tp,
965 int abortflag)
966{
967 xfs_log_item_desc_t *lidp;
968 xfs_log_item_chunk_t *licp;
969 xfs_log_item_chunk_t *next_licp;
970
971 /* Call the transaction's completion callback if there is one. */
972 if (tp->t_callback != NULL)
973 tp->t_callback(tp, tp->t_callarg);
974
975 for (lidp = xfs_trans_first_item(tp);
976 lidp != NULL;
977 lidp = xfs_trans_next_item(tp, lidp)) {
978 xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag);
979 }
980
981 /* free the item chunks, ignoring the embedded chunk */
982 for (licp = tp->t_items.lic_next; licp != NULL; licp = next_licp) {
983 next_licp = licp->lic_next;
984 kmem_free(licp);
985 }
986
987 xfs_trans_clear_busy_extents(tp);
988 xfs_trans_free(tp);
989}
990
991/*
992 * Called from the trans_commit code when we notice that
993 * the filesystem is in the middle of a forced shutdown.
994 */
995STATIC void
996xfs_trans_uncommit(
997 struct xfs_trans *tp,
998 uint flags)
999{
1000 xfs_log_item_desc_t *lidp;
1001
1002 for (lidp = xfs_trans_first_item(tp);
1003 lidp != NULL;
1004 lidp = xfs_trans_next_item(tp, lidp)) {
827 /* 1005 /*
828 * It is indeed possible for the transaction to be 1006 * Unpin all but those that aren't dirty.
829 * not dirty but the dqinfo portion to be. All that
830 * means is that we have some (non-persistent) quota
831 * reservations that need to be unreserved.
832 */ 1007 */
833 xfs_trans_unreserve_and_mod_dquots(tp); 1008 if (lidp->lid_flags & XFS_LID_DIRTY)
834 if (tp->t_ticket) { 1009 IOP_UNPIN_REMOVE(lidp->lid_item, tp);
835 commit_lsn = xfs_log_done(mp, tp->t_ticket,
836 NULL, log_flags);
837 if (commit_lsn == -1 && !shutdown)
838 shutdown = XFS_ERROR(EIO);
839 }
840 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
841 xfs_trans_free_items(tp, shutdown? XFS_TRANS_ABORT : 0);
842 xfs_trans_free_busy(tp);
843 xfs_trans_free(tp);
844 XFS_STATS_INC(xs_trans_empty);
845 return (shutdown);
846 } 1010 }
847 ASSERT(tp->t_ticket != NULL);
848 1011
849 /* 1012 xfs_trans_unreserve_and_mod_sb(tp);
850 * If we need to update the superblock, then do it now. 1013 xfs_trans_unreserve_and_mod_dquots(tp);
851 */ 1014
852 if (tp->t_flags & XFS_TRANS_SB_DIRTY) 1015 xfs_trans_free_items(tp, flags);
853 xfs_trans_apply_sb_deltas(tp); 1016 xfs_trans_free_busy(tp);
854 xfs_trans_apply_dquot_deltas(tp); 1017 xfs_trans_free(tp);
1018}
1019
1020/*
1021 * Format the transaction direct to the iclog. This isolates the physical
1022 * transaction commit operation from the logical operation and hence allows
1023 * other methods to be introduced without affecting the existing commit path.
1024 */
1025static int
1026xfs_trans_commit_iclog(
1027 struct xfs_mount *mp,
1028 struct xfs_trans *tp,
1029 xfs_lsn_t *commit_lsn,
1030 int flags)
1031{
1032 int shutdown;
1033 int error;
1034 int log_flags = 0;
1035 struct xlog_in_core *commit_iclog;
1036#define XFS_TRANS_LOGVEC_COUNT 16
1037 struct xfs_log_iovec log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
1038 struct xfs_log_iovec *log_vector;
1039 uint nvec;
1040
855 1041
856 /* 1042 /*
857 * Ask each log item how many log_vector entries it will 1043 * Ask each log item how many log_vector entries it will
@@ -861,8 +1047,7 @@ shut_us_down:
861 */ 1047 */
862 nvec = xfs_trans_count_vecs(tp); 1048 nvec = xfs_trans_count_vecs(tp);
863 if (nvec == 0) { 1049 if (nvec == 0) {
864 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 1050 return ENOMEM; /* triggers a shutdown! */
865 goto shut_us_down;
866 } else if (nvec <= XFS_TRANS_LOGVEC_COUNT) { 1051 } else if (nvec <= XFS_TRANS_LOGVEC_COUNT) {
867 log_vector = log_vector_fast; 1052 log_vector = log_vector_fast;
868 } else { 1053 } else {
@@ -877,6 +1062,9 @@ shut_us_down:
877 */ 1062 */
878 xfs_trans_fill_vecs(tp, log_vector); 1063 xfs_trans_fill_vecs(tp, log_vector);
879 1064
1065 if (flags & XFS_TRANS_RELEASE_LOG_RES)
1066 log_flags = XFS_LOG_REL_PERM_RESERV;
1067
880 error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket, &(tp->t_lsn)); 1068 error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket, &(tp->t_lsn));
881 1069
882 /* 1070 /*
@@ -884,18 +1072,17 @@ shut_us_down:
884 * at any time after this call. However, all the items associated 1072 * at any time after this call. However, all the items associated
885 * with the transaction are still locked and pinned in memory. 1073 * with the transaction are still locked and pinned in memory.
886 */ 1074 */
887 commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags); 1075 *commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags);
888 1076
889 tp->t_commit_lsn = commit_lsn; 1077 tp->t_commit_lsn = *commit_lsn;
890 if (nvec > XFS_TRANS_LOGVEC_COUNT) { 1078 if (nvec > XFS_TRANS_LOGVEC_COUNT)
891 kmem_free(log_vector); 1079 kmem_free(log_vector);
892 }
893 1080
894 /* 1081 /*
895 * If we got a log write error. Unpin the logitems that we 1082 * If we got a log write error. Unpin the logitems that we
896 * had pinned, clean up, free trans structure, and return error. 1083 * had pinned, clean up, free trans structure, and return error.
897 */ 1084 */
898 if (error || commit_lsn == -1) { 1085 if (error || *commit_lsn == -1) {
899 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1086 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
900 xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT); 1087 xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT);
901 return XFS_ERROR(EIO); 1088 return XFS_ERROR(EIO);
@@ -909,8 +1096,6 @@ shut_us_down:
909 */ 1096 */
910 xfs_trans_unreserve_and_mod_sb(tp); 1097 xfs_trans_unreserve_and_mod_sb(tp);
911 1098
912 sync = tp->t_flags & XFS_TRANS_SYNC;
913
914 /* 1099 /*
915 * Tell the LM to call the transaction completion routine 1100 * Tell the LM to call the transaction completion routine
916 * when the log write with LSN commit_lsn completes (e.g. 1101 * when the log write with LSN commit_lsn completes (e.g.
@@ -953,7 +1138,7 @@ shut_us_down:
953 * the commit lsn of this transaction for dependency tracking 1138 * the commit lsn of this transaction for dependency tracking
954 * purposes. 1139 * purposes.
955 */ 1140 */
956 xfs_trans_unlock_items(tp, commit_lsn); 1141 xfs_trans_unlock_items(tp, *commit_lsn);
957 1142
958 /* 1143 /*
959 * If we detected a log error earlier, finish committing 1144 * If we detected a log error earlier, finish committing
@@ -973,156 +1158,114 @@ shut_us_down:
973 * and the items are released we can finally allow the iclog to 1158 * and the items are released we can finally allow the iclog to
974 * go to disk. 1159 * go to disk.
975 */ 1160 */
976 error = xfs_log_release_iclog(mp, commit_iclog); 1161 return xfs_log_release_iclog(mp, commit_iclog);
977
978 /*
979 * If the transaction needs to be synchronous, then force the
980 * log out now and wait for it.
981 */
982 if (sync) {
983 if (!error) {
984 error = _xfs_log_force_lsn(mp, commit_lsn,
985 XFS_LOG_SYNC, log_flushed);
986 }
987 XFS_STATS_INC(xs_trans_sync);
988 } else {
989 XFS_STATS_INC(xs_trans_async);
990 }
991
992 return (error);
993} 1162}
994 1163
995 1164
996/* 1165/*
997 * Total up the number of log iovecs needed to commit this 1166 * xfs_trans_commit
998 * transaction. The transaction itself needs one for the 1167 *
999 * transaction header. Ask each dirty item in turn how many 1168 * Commit the given transaction to the log a/synchronously.
1000 * it needs to get the total. 1169 *
1170 * XFS disk error handling mechanism is not based on a typical
1171 * transaction abort mechanism. Logically after the filesystem
1172 * gets marked 'SHUTDOWN', we can't let any new transactions
1173 * be durable - ie. committed to disk - because some metadata might
1174 * be inconsistent. In such cases, this returns an error, and the
1175 * caller may assume that all locked objects joined to the transaction
1176 * have already been unlocked as if the commit had succeeded.
1177 * Do not reference the transaction structure after this call.
1001 */ 1178 */
1002STATIC uint 1179int
1003xfs_trans_count_vecs( 1180_xfs_trans_commit(
1004 xfs_trans_t *tp) 1181 struct xfs_trans *tp,
1182 uint flags,
1183 int *log_flushed)
1005{ 1184{
1006 int nvecs; 1185 struct xfs_mount *mp = tp->t_mountp;
1007 xfs_log_item_desc_t *lidp; 1186 xfs_lsn_t commit_lsn = -1;
1187 int error = 0;
1188 int log_flags = 0;
1189 int sync = tp->t_flags & XFS_TRANS_SYNC;
1008 1190
1009 nvecs = 1; 1191 /*
1010 lidp = xfs_trans_first_item(tp); 1192 * Determine whether this commit is releasing a permanent
1011 ASSERT(lidp != NULL); 1193 * log reservation or not.
1012
1013 /* In the non-debug case we need to start bailing out if we
1014 * didn't find a log_item here, return zero and let trans_commit
1015 * deal with it.
1016 */ 1194 */
1017 if (lidp == NULL) 1195 if (flags & XFS_TRANS_RELEASE_LOG_RES) {
1018 return 0; 1196 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1019 1197 log_flags = XFS_LOG_REL_PERM_RESERV;
1020 while (lidp != NULL) {
1021 /*
1022 * Skip items which aren't dirty in this transaction.
1023 */
1024 if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
1025 lidp = xfs_trans_next_item(tp, lidp);
1026 continue;
1027 }
1028 lidp->lid_size = IOP_SIZE(lidp->lid_item);
1029 nvecs += lidp->lid_size;
1030 lidp = xfs_trans_next_item(tp, lidp);
1031 } 1198 }
1032 1199
1033 return nvecs; 1200 /*
1034} 1201 * If there is nothing to be logged by the transaction,
1035 1202 * then unlock all of the items associated with the
1036/* 1203 * transaction and free the transaction structure.
1037 * Called from the trans_commit code when we notice that 1204 * Also make sure to return any reserved blocks to
1038 * the filesystem is in the middle of a forced shutdown. 1205 * the free pool.
1039 */ 1206 */
1040STATIC void 1207 if (!(tp->t_flags & XFS_TRANS_DIRTY))
1041xfs_trans_uncommit( 1208 goto out_unreserve;
1042 xfs_trans_t *tp,
1043 uint flags)
1044{
1045 xfs_log_item_desc_t *lidp;
1046 1209
1047 for (lidp = xfs_trans_first_item(tp); 1210 if (XFS_FORCED_SHUTDOWN(mp)) {
1048 lidp != NULL; 1211 error = XFS_ERROR(EIO);
1049 lidp = xfs_trans_next_item(tp, lidp)) { 1212 goto out_unreserve;
1050 /*
1051 * Unpin all but those that aren't dirty.
1052 */
1053 if (lidp->lid_flags & XFS_LID_DIRTY)
1054 IOP_UNPIN_REMOVE(lidp->lid_item, tp);
1055 } 1213 }
1056 1214
1057 xfs_trans_unreserve_and_mod_sb(tp); 1215 ASSERT(tp->t_ticket != NULL);
1058 xfs_trans_unreserve_and_mod_dquots(tp);
1059 1216
1060 xfs_trans_free_items(tp, flags); 1217 /*
1061 xfs_trans_free_busy(tp); 1218 * If we need to update the superblock, then do it now.
1062 xfs_trans_free(tp); 1219 */
1063} 1220 if (tp->t_flags & XFS_TRANS_SB_DIRTY)
1221 xfs_trans_apply_sb_deltas(tp);
1222 xfs_trans_apply_dquot_deltas(tp);
1064 1223
1065/* 1224 error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags);
1066 * Fill in the vector with pointers to data to be logged 1225 if (error == ENOMEM) {
1067 * by this transaction. The transaction header takes 1226 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
1068 * the first vector, and then each dirty item takes the 1227 error = XFS_ERROR(EIO);
1069 * number of vectors it indicated it needed in xfs_trans_count_vecs(). 1228 goto out_unreserve;
1070 * 1229 }
1071 * As each item fills in the entries it needs, also pin the item
1072 * so that it cannot be flushed out until the log write completes.
1073 */
1074STATIC void
1075xfs_trans_fill_vecs(
1076 xfs_trans_t *tp,
1077 xfs_log_iovec_t *log_vector)
1078{
1079 xfs_log_item_desc_t *lidp;
1080 xfs_log_iovec_t *vecp;
1081 uint nitems;
1082 1230
1083 /* 1231 /*
1084 * Skip over the entry for the transaction header, we'll 1232 * If the transaction needs to be synchronous, then force the
1085 * fill that in at the end. 1233 * log out now and wait for it.
1086 */ 1234 */
1087 vecp = log_vector + 1; /* pointer arithmetic */ 1235 if (sync) {
1088 1236 if (!error) {
1089 nitems = 0; 1237 error = _xfs_log_force_lsn(mp, commit_lsn,
1090 lidp = xfs_trans_first_item(tp); 1238 XFS_LOG_SYNC, log_flushed);
1091 ASSERT(lidp != NULL);
1092 while (lidp != NULL) {
1093 /*
1094 * Skip items which aren't dirty in this transaction.
1095 */
1096 if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
1097 lidp = xfs_trans_next_item(tp, lidp);
1098 continue;
1099 }
1100 /*
1101 * The item may be marked dirty but not log anything.
1102 * This can be used to get called when a transaction
1103 * is committed.
1104 */
1105 if (lidp->lid_size) {
1106 nitems++;
1107 } 1239 }
1108 IOP_FORMAT(lidp->lid_item, vecp); 1240 XFS_STATS_INC(xs_trans_sync);
1109 vecp += lidp->lid_size; /* pointer arithmetic */ 1241 } else {
1110 IOP_PIN(lidp->lid_item); 1242 XFS_STATS_INC(xs_trans_async);
1111 lidp = xfs_trans_next_item(tp, lidp);
1112 } 1243 }
1113 1244
1245 return error;
1246
1247out_unreserve:
1248 xfs_trans_unreserve_and_mod_sb(tp);
1249
1114 /* 1250 /*
1115 * Now that we've counted the number of items in this 1251 * It is indeed possible for the transaction to be not dirty but
1116 * transaction, fill in the transaction header. 1252 * the dqinfo portion to be. All that means is that we have some
1253 * (non-persistent) quota reservations that need to be unreserved.
1117 */ 1254 */
1118 tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC; 1255 xfs_trans_unreserve_and_mod_dquots(tp);
1119 tp->t_header.th_type = tp->t_type; 1256 if (tp->t_ticket) {
1120 tp->t_header.th_num_items = nitems; 1257 commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
1121 log_vector->i_addr = (xfs_caddr_t)&tp->t_header; 1258 if (commit_lsn == -1 && !error)
1122 log_vector->i_len = sizeof(xfs_trans_header_t); 1259 error = XFS_ERROR(EIO);
1123 log_vector->i_type = XLOG_REG_TYPE_TRANSHDR; 1260 }
1124} 1261 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1262 xfs_trans_free_items(tp, error ? XFS_TRANS_ABORT : 0);
1263 xfs_trans_free_busy(tp);
1264 xfs_trans_free(tp);
1125 1265
1266 XFS_STATS_INC(xs_trans_empty);
1267 return error;
1268}
1126 1269
1127/* 1270/*
1128 * Unlock all of the transaction's items and free the transaction. 1271 * Unlock all of the transaction's items and free the transaction.
@@ -1200,20 +1343,6 @@ xfs_trans_cancel(
1200 xfs_trans_free(tp); 1343 xfs_trans_free(tp);
1201} 1344}
1202 1345
1203
1204/*
1205 * Free the transaction structure. If there is more clean up
1206 * to do when the structure is freed, add it here.
1207 */
1208STATIC void
1209xfs_trans_free(
1210 xfs_trans_t *tp)
1211{
1212 atomic_dec(&tp->t_mountp->m_active_trans);
1213 xfs_trans_free_dqinfo(tp);
1214 kmem_zone_free(xfs_trans_zone, tp);
1215}
1216
1217/* 1346/*
1218 * Roll from one trans in the sequence of PERMANENT transactions to 1347 * Roll from one trans in the sequence of PERMANENT transactions to
1219 * the next: permanent transactions are only flushed out when 1348 * the next: permanent transactions are only flushed out when
@@ -1283,174 +1412,3 @@ xfs_trans_roll(
1283 xfs_trans_ihold(trans, dp); 1412 xfs_trans_ihold(trans, dp);
1284 return 0; 1413 return 0;
1285} 1414}
1286
1287/*
1288 * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item().
1289 *
1290 * This is typically called by the LM when a transaction has been fully
1291 * committed to disk. It needs to unpin the items which have
1292 * been logged by the transaction and update their positions
1293 * in the AIL if necessary.
1294 * This also gets called when the transactions didn't get written out
1295 * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then.
1296 *
1297 * Call xfs_trans_chunk_committed() to process the items in
1298 * each chunk.
1299 */
1300STATIC void
1301xfs_trans_committed(
1302 xfs_trans_t *tp,
1303 int abortflag)
1304{
1305 xfs_log_item_chunk_t *licp;
1306 xfs_log_item_chunk_t *next_licp;
1307 xfs_log_busy_chunk_t *lbcp;
1308 xfs_log_busy_slot_t *lbsp;
1309 int i;
1310
1311 /*
1312 * Call the transaction's completion callback if there
1313 * is one.
1314 */
1315 if (tp->t_callback != NULL) {
1316 tp->t_callback(tp, tp->t_callarg);
1317 }
1318
1319 /*
1320 * Special case the chunk embedded in the transaction.
1321 */
1322 licp = &(tp->t_items);
1323 if (!(xfs_lic_are_all_free(licp))) {
1324 xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
1325 }
1326
1327 /*
1328 * Process the items in each chunk in turn.
1329 */
1330 licp = licp->lic_next;
1331 while (licp != NULL) {
1332 ASSERT(!xfs_lic_are_all_free(licp));
1333 xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
1334 next_licp = licp->lic_next;
1335 kmem_free(licp);
1336 licp = next_licp;
1337 }
1338
1339 /*
1340 * Clear all the per-AG busy list items listed in this transaction
1341 */
1342 lbcp = &tp->t_busy;
1343 while (lbcp != NULL) {
1344 for (i = 0, lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) {
1345 if (!XFS_LBC_ISFREE(lbcp, i)) {
1346 xfs_alloc_clear_busy(tp, lbsp->lbc_ag,
1347 lbsp->lbc_idx);
1348 }
1349 }
1350 lbcp = lbcp->lbc_next;
1351 }
1352 xfs_trans_free_busy(tp);
1353
1354 /*
1355 * That's it for the transaction structure. Free it.
1356 */
1357 xfs_trans_free(tp);
1358}
1359
1360/*
1361 * This is called to perform the commit processing for each
1362 * item described by the given chunk.
1363 *
1364 * The commit processing consists of unlocking items which were
1365 * held locked with the SYNC_UNLOCK attribute, calling the committed
1366 * routine of each logged item, updating the item's position in the AIL
1367 * if necessary, and unpinning each item. If the committed routine
1368 * returns -1, then do nothing further with the item because it
1369 * may have been freed.
1370 *
1371 * Since items are unlocked when they are copied to the incore
1372 * log, it is possible for two transactions to be completing
1373 * and manipulating the same item simultaneously. The AIL lock
1374 * will protect the lsn field of each item. The value of this
1375 * field can never go backwards.
1376 *
1377 * We unpin the items after repositioning them in the AIL, because
1378 * otherwise they could be immediately flushed and we'd have to race
1379 * with the flusher trying to pull the item from the AIL as we add it.
1380 */
1381STATIC void
1382xfs_trans_chunk_committed(
1383 xfs_log_item_chunk_t *licp,
1384 xfs_lsn_t lsn,
1385 int aborted)
1386{
1387 xfs_log_item_desc_t *lidp;
1388 xfs_log_item_t *lip;
1389 xfs_lsn_t item_lsn;
1390 int i;
1391
1392 lidp = licp->lic_descs;
1393 for (i = 0; i < licp->lic_unused; i++, lidp++) {
1394 struct xfs_ail *ailp;
1395
1396 if (xfs_lic_isfree(licp, i)) {
1397 continue;
1398 }
1399
1400 lip = lidp->lid_item;
1401 if (aborted)
1402 lip->li_flags |= XFS_LI_ABORTED;
1403
1404 /*
1405 * Send in the ABORTED flag to the COMMITTED routine
1406 * so that it knows whether the transaction was aborted
1407 * or not.
1408 */
1409 item_lsn = IOP_COMMITTED(lip, lsn);
1410
1411 /*
1412 * If the committed routine returns -1, make
1413 * no more references to the item.
1414 */
1415 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) {
1416 continue;
1417 }
1418
1419 /*
1420 * If the returned lsn is greater than what it
1421 * contained before, update the location of the
1422 * item in the AIL. If it is not, then do nothing.
1423 * Items can never move backwards in the AIL.
1424 *
1425 * While the new lsn should usually be greater, it
1426 * is possible that a later transaction completing
1427 * simultaneously with an earlier one using the
1428 * same item could complete first with a higher lsn.
1429 * This would cause the earlier transaction to fail
1430 * the test below.
1431 */
1432 ailp = lip->li_ailp;
1433 spin_lock(&ailp->xa_lock);
1434 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
1435 /*
1436 * This will set the item's lsn to item_lsn
1437 * and update the position of the item in
1438 * the AIL.
1439 *
1440 * xfs_trans_ail_update() drops the AIL lock.
1441 */
1442 xfs_trans_ail_update(ailp, lip, item_lsn);
1443 } else {
1444 spin_unlock(&ailp->xa_lock);
1445 }
1446
1447 /*
1448 * Now that we've repositioned the item in the AIL,
1449 * unpin it so it can be flushed. Pass information
1450 * about buffer stale state down from the log item
1451 * flags, if anyone else stales the buffer we do not
1452 * want to pay any attention to it.
1453 */
1454 IOP_UNPIN(lip, lidp->lid_flags & XFS_LID_BUF_STALE);
1455 }
1456}
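
The committed-item path rewritten above preserves one invariant worth restating: an item's LSN in the AIL only ever advances, because two commits can race on the same item and the later transaction may finish first. A reduced pthread sketch of that compare-under-lock rule — a model of the idea, not the real AIL code:

#include <pthread.h>

typedef long long lsn_t;

struct item {
	lsn_t		lsn;
	pthread_mutex_t	*ail_lock;	/* stands in for xa_lock */
};

/* Reposition only forward: if a racing, later commit already moved
 * the item to a higher LSN, leave it where it is. */
static void item_committed(struct item *ip, lsn_t item_lsn)
{
	pthread_mutex_lock(ip->ail_lock);
	if (item_lsn > ip->lsn)
		ip->lsn = item_lsn;	/* reposition in the AIL */
	pthread_mutex_unlock(ip->ail_lock);
}
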
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 79c8bab9dfff..c62beee0921e 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -49,6 +49,15 @@ typedef struct xfs_trans_header {
49#define XFS_LI_DQUOT 0x123d 49#define XFS_LI_DQUOT 0x123d
50#define XFS_LI_QUOTAOFF 0x123e 50#define XFS_LI_QUOTAOFF 0x123e
51 51
52#define XFS_LI_TYPE_DESC \
53 { XFS_LI_EFI, "XFS_LI_EFI" }, \
54 { XFS_LI_EFD, "XFS_LI_EFD" }, \
55 { XFS_LI_IUNLINK, "XFS_LI_IUNLINK" }, \
56 { XFS_LI_INODE, "XFS_LI_INODE" }, \
57 { XFS_LI_BUF, "XFS_LI_BUF" }, \
58 { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \
59 { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }
60
52/* 61/*
53 * Transaction types. Used to distinguish types of buffers. 62 * Transaction types. Used to distinguish types of buffers.
54 */ 63 */
@@ -159,7 +168,6 @@ typedef struct xfs_log_item_desc {
159 168
160#define XFS_LID_DIRTY 0x1 169#define XFS_LID_DIRTY 0x1
161#define XFS_LID_PINNED 0x2 170#define XFS_LID_PINNED 0x2
162#define XFS_LID_BUF_STALE 0x8
163 171
164/* 172/*
165 * This structure is used to maintain a chunk list of log_item_desc 173 * This structure is used to maintain a chunk list of log_item_desc
@@ -833,7 +841,7 @@ typedef struct xfs_item_ops {
833 uint (*iop_size)(xfs_log_item_t *); 841 uint (*iop_size)(xfs_log_item_t *);
834 void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); 842 void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
835 void (*iop_pin)(xfs_log_item_t *); 843 void (*iop_pin)(xfs_log_item_t *);
836 void (*iop_unpin)(xfs_log_item_t *, int); 844 void (*iop_unpin)(xfs_log_item_t *);
837 void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *); 845 void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
838 uint (*iop_trylock)(xfs_log_item_t *); 846 uint (*iop_trylock)(xfs_log_item_t *);
839 void (*iop_unlock)(xfs_log_item_t *); 847 void (*iop_unlock)(xfs_log_item_t *);
@@ -846,7 +854,7 @@ typedef struct xfs_item_ops {
846#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip) 854#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip)
847#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) 855#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp)
848#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip) 856#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip)
849#define IOP_UNPIN(ip, flags) (*(ip)->li_ops->iop_unpin)(ip, flags) 857#define IOP_UNPIN(ip) (*(ip)->li_ops->iop_unpin)(ip)
850#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp) 858#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
851#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip) 859#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip)
852#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip) 860#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip)
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index fb586360d1c9..9cd809025f3a 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -40,11 +40,51 @@
40#include "xfs_rw.h" 40#include "xfs_rw.h"
41#include "xfs_trace.h" 41#include "xfs_trace.h"
42 42
43/*
44 * Check to see if a buffer matching the given parameters is already
45 * a part of the given transaction.
46 */
47STATIC struct xfs_buf *
48xfs_trans_buf_item_match(
49 struct xfs_trans *tp,
50 struct xfs_buftarg *target,
51 xfs_daddr_t blkno,
52 int len)
53{
54 xfs_log_item_chunk_t *licp;
55 xfs_log_item_desc_t *lidp;
56 xfs_buf_log_item_t *blip;
57 int i;
43 58
44STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *, 59 len = BBTOB(len);
45 xfs_daddr_t, int); 60 for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
46STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *, 61 if (xfs_lic_are_all_free(licp)) {
47 xfs_daddr_t, int); 62 ASSERT(licp == &tp->t_items);
63 ASSERT(licp->lic_next == NULL);
64 return NULL;
65 }
66
67 for (i = 0; i < licp->lic_unused; i++) {
68 /*
69 * Skip unoccupied slots.
70 */
71 if (xfs_lic_isfree(licp, i))
72 continue;
73
74 lidp = xfs_lic_slot(licp, i);
75 blip = (xfs_buf_log_item_t *)lidp->lid_item;
76 if (blip->bli_item.li_type != XFS_LI_BUF)
77 continue;
78
79 if (XFS_BUF_TARGET(blip->bli_buf) == target &&
80 XFS_BUF_ADDR(blip->bli_buf) == blkno &&
81 XFS_BUF_COUNT(blip->bli_buf) == len)
82 return blip->bli_buf;
83 }
84 }
85
86 return NULL;
87}
48 88
49/* 89/*
50 * Add the locked buffer to the transaction. 90 * Add the locked buffer to the transaction.
@@ -112,14 +152,6 @@ xfs_trans_bjoin(
112 * within the transaction, just increment its lock recursion count 152 * within the transaction, just increment its lock recursion count
113 * and return a pointer to it. 153 * and return a pointer to it.
114 * 154 *
115 * Use the fast path function xfs_trans_buf_item_match() or the buffer
116 * cache routine incore_match() to find the buffer
117 * if it is already owned by this transaction.
118 *
119 * If we don't already own the buffer, use get_buf() to get it.
120 * If it doesn't yet have an associated xfs_buf_log_item structure,
121 * then allocate one and add the item to this transaction.
122 *
123 * If the transaction pointer is NULL, make this just a normal 155 * If the transaction pointer is NULL, make this just a normal
124 * get_buf() call. 156 * get_buf() call.
125 */ 157 */
@@ -149,11 +181,7 @@ xfs_trans_get_buf(xfs_trans_t *tp,
149 * have it locked. In this case we just increment the lock 181 * have it locked. In this case we just increment the lock
150 * recursion count and return the buffer to the caller. 182 * recursion count and return the buffer to the caller.
151 */ 183 */
152 if (tp->t_items.lic_next == NULL) { 184 bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len);
153 bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len);
154 } else {
155 bp = xfs_trans_buf_item_match_all(tp, target_dev, blkno, len);
156 }
157 if (bp != NULL) { 185 if (bp != NULL) {
158 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 186 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
159 if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) 187 if (XFS_FORCED_SHUTDOWN(tp->t_mountp))
@@ -259,14 +287,6 @@ int xfs_error_mod = 33;
  * within the transaction and already read in, just increment its
  * lock recursion count and return a pointer to it.
  *
- * Use the fast path function xfs_trans_buf_item_match() or the buffer
- * cache routine incore_match() to find the buffer
- * if it is already owned by this transaction.
- *
- * If we don't already own the buffer, use read_buf() to get it.
- * If it doesn't yet have an associated xfs_buf_log_item structure,
- * then allocate one and add the item to this transaction.
- *
  * If the transaction pointer is NULL, make this just a normal
  * read_buf() call.
  */
@@ -328,11 +348,7 @@ xfs_trans_read_buf(
 	 * If the buffer is not yet read in, then we read it in, increment
 	 * the lock recursion count, and return it to the caller.
 	 */
-	if (tp->t_items.lic_next == NULL) {
-		bp = xfs_trans_buf_item_match(tp, target, blkno, len);
-	} else {
-		bp = xfs_trans_buf_item_match_all(tp, target, blkno, len);
-	}
+	bp = xfs_trans_buf_item_match(tp, target, blkno, len);
 	if (bp != NULL) {
 		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
 		ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
@@ -696,7 +712,6 @@ xfs_trans_log_buf(xfs_trans_t *tp,
 
 	tp->t_flags |= XFS_TRANS_DIRTY;
 	lidp->lid_flags |= XFS_LID_DIRTY;
-	lidp->lid_flags &= ~XFS_LID_BUF_STALE;
 	bip->bli_flags |= XFS_BLI_LOGGED;
 	xfs_buf_item_log(bip, first, last);
 }
@@ -782,7 +797,7 @@ xfs_trans_binval(
 	bip->bli_format.blf_flags |= XFS_BLI_CANCEL;
 	memset((char *)(bip->bli_format.blf_data_map), 0,
 	       (bip->bli_format.blf_map_size * sizeof(uint)));
-	lidp->lid_flags |= XFS_LID_DIRTY|XFS_LID_BUF_STALE;
+	lidp->lid_flags |= XFS_LID_DIRTY;
 	tp->t_flags |= XFS_TRANS_DIRTY;
 }
 
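The two hunks above also retire the descriptor-level XFS_LID_BUF_STALE flag: xfs_trans_log_buf() no longer clears it and xfs_trans_binval() no longer sets it, leaving XFS_LID_DIRTY as the only descriptor flag these paths touch. The likely rationale (an inference; the rest of the patch is not visible in these hunks) is that staleness is already tracked on the buf log item itself via XFS_BLI_STALE, making the descriptor copy redundant. A toy sketch of the before/after flag handling, with made-up bit values:

#include <stdio.h>

/* Made-up bit values for illustration, not the kernel's definitions. */
#define LID_DIRTY	(1u << 0)
#define LID_BUF_STALE	(1u << 1)

int main(void)
{
	unsigned int lid_flags = 0;

	/* Old xfs_trans_binval(): dirty the descriptor and mark it stale. */
	lid_flags |= LID_DIRTY | LID_BUF_STALE;
	printf("old binval:  %#x\n", lid_flags);

	/* Old xfs_trans_log_buf(): re-logging cleared the stale bit again. */
	lid_flags |= LID_DIRTY;
	lid_flags &= ~LID_BUF_STALE;
	printf("old log_buf: %#x\n", lid_flags);

	/* New code: both paths simply set LID_DIRTY; staleness stays on the
	 * buf log item (XFS_BLI_STALE), not on the descriptor. */
	lid_flags = 0;
	lid_flags |= LID_DIRTY;
	printf("new:         %#x\n", lid_flags);
	return 0;
}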
@@ -902,111 +917,3 @@ xfs_trans_dquot_buf(
 
 	bip->bli_format.blf_flags |= type;
 }
-
-/*
- * Check to see if a buffer matching the given parameters is already
- * a part of the given transaction. Only check the first, embedded
- * chunk, since we don't want to spend all day scanning large transactions.
- */
-STATIC xfs_buf_t *
-xfs_trans_buf_item_match(
-	xfs_trans_t	*tp,
-	xfs_buftarg_t	*target,
-	xfs_daddr_t	blkno,
-	int		len)
-{
-	xfs_log_item_chunk_t	*licp;
-	xfs_log_item_desc_t	*lidp;
-	xfs_buf_log_item_t	*blip;
-	xfs_buf_t		*bp;
-	int			i;
-
-	bp = NULL;
-	len = BBTOB(len);
-	licp = &tp->t_items;
-	if (!xfs_lic_are_all_free(licp)) {
-		for (i = 0; i < licp->lic_unused; i++) {
-			/*
-			 * Skip unoccupied slots.
-			 */
-			if (xfs_lic_isfree(licp, i)) {
-				continue;
-			}
-
-			lidp = xfs_lic_slot(licp, i);
-			blip = (xfs_buf_log_item_t *)lidp->lid_item;
-			if (blip->bli_item.li_type != XFS_LI_BUF) {
-				continue;
-			}
-
-			bp = blip->bli_buf;
-			if ((XFS_BUF_TARGET(bp) == target) &&
-			    (XFS_BUF_ADDR(bp) == blkno) &&
-			    (XFS_BUF_COUNT(bp) == len)) {
-				/*
-				 * We found it.  Break out and
-				 * return the pointer to the buffer.
-				 */
-				break;
-			} else {
-				bp = NULL;
-			}
-		}
-	}
-	return bp;
-}
-
-/*
- * Check to see if a buffer matching the given parameters is already
- * a part of the given transaction.  Check all the chunks, we
- * want to be thorough.
- */
-STATIC xfs_buf_t *
-xfs_trans_buf_item_match_all(
-	xfs_trans_t	*tp,
-	xfs_buftarg_t	*target,
-	xfs_daddr_t	blkno,
-	int		len)
-{
-	xfs_log_item_chunk_t	*licp;
-	xfs_log_item_desc_t	*lidp;
-	xfs_buf_log_item_t	*blip;
-	xfs_buf_t		*bp;
-	int			i;
-
-	bp = NULL;
-	len = BBTOB(len);
-	for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
-		if (xfs_lic_are_all_free(licp)) {
-			ASSERT(licp == &tp->t_items);
-			ASSERT(licp->lic_next == NULL);
-			return NULL;
-		}
-		for (i = 0; i < licp->lic_unused; i++) {
-			/*
-			 * Skip unoccupied slots.
-			 */
-			if (xfs_lic_isfree(licp, i)) {
-				continue;
-			}
-
-			lidp = xfs_lic_slot(licp, i);
-			blip = (xfs_buf_log_item_t *)lidp->lid_item;
-			if (blip->bli_item.li_type != XFS_LI_BUF) {
-				continue;
-			}
-
-			bp = blip->bli_buf;
-			if ((XFS_BUF_TARGET(bp) == target) &&
-			    (XFS_BUF_ADDR(bp) == blkno) &&
-			    (XFS_BUF_COUNT(bp) == len)) {
-				/*
-				 * We found it.  Break out and
-				 * return the pointer to the buffer.
-				 */
-				return bp;
-			}
-		}
-	}
-	return NULL;
-}
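The closing hunk deletes both old matchers outright; per the header @@ -902,111 +917,3 @@ the file ends 108 lines shorter. Nothing functional is lost: for a transaction that never overflowed its embedded chunk, the consolidated routine inspects one chunk and stops, exactly as the old fast path did, and otherwise it behaves like the old match-all scan. The rewrite also drops the bp temporary that the old loop threaded through break/else; compare:

/* Old: carry a temporary, break on a hit, reset it on a miss. */
bp = blip->bli_buf;
if ((XFS_BUF_TARGET(bp) == target) &&
    (XFS_BUF_ADDR(bp) == blkno) &&
    (XFS_BUF_COUNT(bp) == len)) {
	break;
} else {
	bp = NULL;
}

/* New: return the matching buffer directly from the loop. */
if (XFS_BUF_TARGET(blip->bli_buf) == target &&
    XFS_BUF_ADDR(blip->bli_buf) == blkno &&
    XFS_BUF_COUNT(blip->bli_buf) == len)
	return blip->bli_buf;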